# -*- coding: utf-8 -*-
import os.path as op
import gzip
import string
import logging
import urllib.request
import re
from functools import wraps
from tempfile import NamedTemporaryFile
from collections import defaultdict, OrderedDict
import six
from fastcache import clru_cache
import numpy as np
import Bio
from Bio.PDB import PDBIO, Select, NeighborSearch
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from . import errors, helper
logger = logging.getLogger(__name__)
A_DICT = {
'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'C': 'CYS', 'E': 'GLU',
'Q': 'GLN', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'L': 'LEU', 'K': 'LYS',
'M': 'MET', 'F': 'PHE', 'P': 'PRO', 'S': 'SER', 'T': 'THR', 'W': 'TRP',
'Y': 'TYR', 'V': 'VAL', 'U': 'SEC', 'O': 'PYL',
'B': 'ASX', 'Z': 'GLX', 'J': 'XLE', 'X': 'XAA', '*': 'TER'
}
AAA_DICT = dict([(value, key) for key, value in list(A_DICT.items())])
AAA_DICT['UNK'] = 'X'
AAA_DICT['MSE'] = 'M'
AAA_DICT['CSD'] = 'C'
# Phosphorylated residues
# AAA_DICT['SEP'] = 'S' # PHOSPHOSERINE
# AAA_DICT['TPO'] = 'T' # PHOSPHOTHREONINE
# AAA_DICT['SEP'] = 'Y' # O-PHOSPHOTYROSINE
# Methylated lysines
AAA_DICT['MLZ'] = 'K'
AAA_DICT['MLY'] = 'K'
AAA_DICT['M3L'] = 'K'
AMINO_ACIDS = list(AAA_DICT.keys())
METHYLATED_LYSINES = ['MLZ', 'MLY', 'M3L']
LYSINE_ATOMS = ['N', 'CA', 'CB', 'CG', 'CD', 'CE', 'NZ', 'C', 'O']
# %% Functions for downloading and parsing pdb files
class MMCIFParserMod(MMCIFParser):
def __init__(self, temp_dir):
self.temp_dir = temp_dir
[docs] def get_structure(self, structure_id, gzip_fh):
"""
Altered `get_structure` method which accepts gzip file handles as input.
"""
with NamedTemporaryFile(mode='w', dir=self.temp_dir) as temp_fh:
temp_fh.writelines(gzip_fh.readlines())
temp_fh.flush()
temp_fh.seek(0)
return super(MMCIFParserMod, self).get_structure(structure_id, temp_fh.name)
[docs]def get_pdb_id(pdb_file):
pdb_id = op.basename(pdb_file)
for ext in ['\.gz$', '\.pdb$', '\.ent$', '\.cif$', '^pdb']:
pdb_id = re.sub(ext, '', pdb_id)
pdb_id = pdb_id.upper()
return pdb_id.upper()
[docs]def get_pdb_file(pdb_id, pdb_database_dir, pdb_type='ent'):
"""Get PDB file from a local mirror of the PDB database.
"""
if pdb_type == 'ent':
# Original PDB structure.
prefix = 'pdb'
suffix = '.ent.gz'
relative_pdb_file = (
pdb_id[1:3].lower() + '/' + prefix + pdb_id.lower() + suffix
)
elif pdb_type == 'cif':
# mmCIF pdb structure.
prefix = ''
suffix = '.cif.gz'
relative_pdb_file = (
'../mmCIF/' +
pdb_id[1:3].lower() + '/' + prefix + pdb_id.lower() + suffix
)
elif pdb_type == 'pdb':
# The first biological unit.
prefix = ''
suffix = '.pdb1.gz'
relative_pdb_file = (
'../../../biounit/coordinates/divided/' +
pdb_id[1:3].lower() + '/' + prefix + pdb_id.lower() + suffix
)
elif pdb_type == 'raw':
# Just a PDB file in some folder.
relative_pdb_file = ''
else:
raise Exception
pdb_file = op.join(pdb_database_dir, relative_pdb_file)
return pdb_file
[docs]def download_pdb_file(pdb_id, output_dir):
"""Move PDB structure to the local working directory.
"""
PDB_URL = 'http://www.rcsb.org/pdb/files/{}.pdb'
PDB_EURO_URL = 'http://www.ebi.ac.uk/pdbe/entry-files/download/pdb{}.ent'
output_pdb_filename = op.join(output_dir, pdb_id + '.pdb')
# If the PDB already exists, do nothing...
if op.isfile(output_pdb_filename):
logger.debug('PDB file {} already exists...'.format(output_pdb_filename))
return output_pdb_filename
# Download the PDB file from the internet...
logger.info('Downloading PDB {}...'.format(pdb_id + '.pdb'))
try:
pdb_url = PDB_URL.format(pdb_id)
logger.debug(pdb_url)
response = urllib.request.urlopen(pdb_url)
except urllib.error.URLError as e:
logger.debug(e)
pdb_url = PDB_EURO_URL.format(pdb_id.lower())
logger.debug(pdb_url)
response = urllib.request.urlopen(pdb_url)
with open(output_pdb_filename, 'wb') as ofh:
ofh.write(response.read())
return output_pdb_filename
[docs]def get_pdb_parser(pdb_type, temp_dir='/tmp'):
"""Get PDB parser that can work with structures of the specified type.
"""
if pdb_type in {'ent', 'pdb', 'raw'}:
parser = PDBParser(QUIET=True)
elif pdb_type in {'cif'}:
parser = MMCIFParserMod(temp_dir=temp_dir)
else:
raise Exception('Unsupported ``pdb_type`` {}'.format(pdb_type))
return parser
@clru_cache(maxsize=128, typed=False)
[docs]def get_pdb(pdb_id, pdb_path, temp_dir='/tmp', pdb_type='ent', use_external=True):
"""
Parse a pdb file with biopythons PDBParser() and return the structure.
Parameters
----------
pdb_code : str
Four letter code of the PDB file
pdb_path : str
Biopython pdb structure
temp_dir : str, optional, default='/tmp/'
Path to the folder for storing temporary files
pdb_type : 'ent'/'pdb'/'cif', optional, default='ent'
The extension of the pdb to use
Raises
------
PDBNotFoundError
If the pdb file could not be retrieved from the local (and remote) databases
"""
pdb_file = get_pdb_file(pdb_id, pdb_path, pdb_type)
pdb_parser = get_pdb_parser(pdb_type, temp_dir)
try:
structure = pdb_parser.get_structure(pdb_id, gzip.open(pdb_file, 'rt'))
except IOError:
error_message = (
'PDB not found! (pdb_id: {}, pdb_path: {}, pdb_type: {})'
.format(pdb_id, pdb_path, pdb_type)
)
if not use_external:
raise errors.PDBNotFoundError(error_message)
print('Retrieving pdb from the wwpdb ftp server...')
temp_filename = download_pdb_file(pdb_id, temp_dir)
structure = pdb_parser.get_structure(pdb_id, gzip.open(temp_filename, 'rt'))
return structure
[docs]def get_pdb_structure(pdb_file, pdb_id=None):
"""Set QUIET to False to output warnings like incomplete chains etc.
"""
logger.debug('pdb_file: {}'.format(pdb_file))
logger.debug('pdb_id: {}'.format(pdb_id))
if pdb_id is None:
pdb_id = get_pdb_id(pdb_file)
parser = PDBParser(get_header=True, QUIET=True)
if pdb_file.endswith('.gz'):
with gzip.open(pdb_file, 'rt') as ifh:
structure = parser.get_structure(pdb_id, ifh)
else:
structure = parser.get_structure(pdb_id, pdb_file)
return structure
# %%
[docs]def euclidean_distance(a, b):
"""Calculate the Euclidean distance between two lists or tuples of arbitrary length.
"""
return np.sqrt(sum((a - b)**2 for a, b in zip(a, b)))
[docs]def calculate_distance(atom_1, atom_2, cutoff=None):
"""Calculate the distance between two points in 3D space.
Parameters
----------
cutoff : float, optional
The maximum distance allowable between two points.
"""
if ((type(atom_1) == type(atom_2) == list) or
(type(atom_1) == type(atom_2) == tuple)):
a = atom_1
b = atom_2
elif hasattr(atom_1, 'coord') and hasattr(atom_2, 'coord'):
a = atom_1.coord
b = atom_2.coord
else:
raise Exception('Unsupported format {} {}'.format(type(atom_1), type(atom_2)))
assert(len(a) == 3 and len(b) == 3)
if cutoff is None or all(abs(p - q) <= cutoff for p, q in zip(a, b)):
return euclidean_distance(a, b)
# %%
[docs]def get_chain_seqres_sequence(chain, aa_only=False):
"""Get the amino acid sequence for the construct coding for the given chain.
Extracts a sequence from a PDB file. Usefull when interested in the
sequence that was used for crystallization and not the ATOM sequence.
Parameters
----------
aa_only : bool
If aa_only is set to `False`, selenomethionines will be included in the sequence.
See: http://biopython.org/DIST/docs/api/Bio.PDB.Polypeptide-module.html.
"""
sequence = Seq('', IUPAC.protein)
for pb in PPBuilder().build_peptides(chain, aa_only=aa_only):
sequence += sequence + pb.get_sequence()
return sequence
[docs]def get_chain_sequence_and_numbering(chain, domain_def_tuple=None, include_hetatms=False):
"""Get the amino acid sequence and a list of residue ids for the given chain.
Parameters
----------
chain : Bio.PDB.Chain.Chain
The chain for which to get the amino acid sequence and numbering.
"""
if domain_def_tuple is not None:
start_resid, end_resid = domain_def_tuple
chain_numbering = []
chain_numbering_extended = []
chain_sequence = []
inside_domain = False
for res in chain:
#
resid = str(res.id[1]) + res.id[2].strip()
if domain_def_tuple is None or resid == start_resid:
inside_domain = True
if inside_domain and (include_hetatms or res.resname in AMINO_ACIDS):
chain_numbering.append(res.id[1])
chain_numbering_extended.append(resid)
chain_sequence.append(AAA_DICT.get(res.resname, '.'))
if domain_def_tuple is not None and resid == end_resid:
inside_domain = False
chain_sequence = ''.join(chain_sequence)
return chain_sequence, chain_numbering_extended
[docs]def convert_position_to_resid(chain, positions, domain_def_tuple=None):
"""Convert mutation_domain to mutation_modeller.
In mutation_modeller, the first amino acid in a chain may start
with something other than 1.
"""
__, chain_numbering = get_chain_sequence_and_numbering(
chain, domain_def_tuple)
logger.debug('chain_numbering: {}'.format(chain_numbering))
logger.debug('positions: {}'.format(positions))
return [chain_numbering[p-1] for p in positions]
[docs]def get_structure_sequences(file_or_structure, seqres_sequence=False):
"""
Convenience function returining a dictionary of sequences for a given file
or a Biopython Structure, Model or Chain.
"""
if isinstance(file_or_structure, six.string_types):
structure = get_pdb_structure(file_or_structure)
model = structure[0]
elif isinstance(file_or_structure, Bio.PDB.Structure.Structure):
model = file_or_structure[0]
elif isinstance(file_or_structure, Bio.PDB.Model.Model):
model = file_or_structure
elif isinstance(file_or_structure, Bio.PDB.Chain.Chain):
model = [file_or_structure]
else:
raise Exception(
'Unexpected type {} for input ``file_or_structure`` {}!'
.format(file_or_structure, type(file_or_structure)))
chain_sequences = defaultdict(list)
for chain in model:
if seqres_sequence:
chain_sequence = get_chain_seqres_sequence(chain)
else:
chain_sequence, __ = get_chain_sequence_and_numbering(chain)
chain_sequences[chain.id] = chain_sequence
return chain_sequences
# %%
[docs]def suppress_logger(fn):
@wraps(fn)
def fn_quiet(*args, **kwargs):
level = logger.level
logger.setLevel(logging.WARNING)
try:
return fn(*args, **kwargs)
finally:
logger.setLevel(level)
return fn_quiet
[docs]def convert_aa(aa, quiet=False):
"""Convert amino acids from three letter code to one letter code or vice versa
.. note:: Deprecated!
Use ``''.join(AAA_DICT[aaa] for aaa in aa)`` and ``''.join(A_DICT[a] for a in aa)``.
"""
if quiet:
return suppress_logger(convert_aa)(aa)
if len(aa) == 3:
try:
return AAA_DICT[aa.upper()]
except KeyError:
if not quiet:
logger.debug('Not a valid amino acid: {}'.format(aa))
return
if len(aa) == 1:
try:
return A_DICT[aa.upper()]
except KeyError:
if not quiet:
logger.debug('Not a valid amino acid: {}'.format(aa))
return
if not quiet:
logger.debug('Not a valid amino acid: {}'.format(aa))
[docs]def convert_resnum_alphanumeric_to_numeric(resnum):
"""
Convert residue numbering that has letters (i.e. 1A, 1B, 1C...) to
residue numbering without letters (i.e. 1, 2, 3...).
.. note:: Deprecated!
Use ``get_chain_sequence_and_numbering()``.
"""
idx_increment = 0
while string.ascii_letters.find(resnum[-1]) != -1:
idx_increment += string.ascii_letters.find(resnum[-1])
resnum = resnum[:-1]
resnum = int(resnum) + idx_increment
return resnum
# %% STANDALONE FUNCTIONS
[docs]def get_interactions(model, chain_id, r_cutoff=6):
"""
"""
interactions = {}
for chain_id_2, chain_2 in model.child_dict.items():
if chain_id == chain_id_2:
continue
interactions[chain_id_2] = (
get_interactions_between_chains(model, chain_id, chain_id_2, r_cutoff)
)
return {k: v for (k, v) in interactions.items() if v}
[docs]def get_interactions_between_chains(model, chain_id_1, chain_id_2, r_cutoff=6):
"""
Calculate interactions between residues in pdb_chain_1 and pdb_chain_2. An
interaction is defines as a pair of residues where at least one pair of atom
is closer than r_cutoff. The default value for r_cutoff is 5 Angstroms.
.. deprecated:: 1.0
Use python:fn:`get_interacting_residues` instead.
It gives you both the residue index and the resnum.
Returns
-------
OrderedDict
Keys are (residue_number, residue_amino_acid) tuples
(e.g. ('0', 'M'), ('1', 'Q'), ...).
Values are lists of (residue_number, residue_amino_acid) tuples.
(e.g. [('0', 'M'), ('1', 'Q'), ...]).
"""
try:
from Bio.PDB import NeighborSearch
except ImportError as e:
print('Importing Biopython NeighborSearch returned an error: {}'.format(e))
print('Using the the slow version of the neighbour-finding algorithm...')
return get_interactions_between_chains_slow(model, chain_id_1, chain_id_2, r_cutoff)
# Extract the chains of interest from the model
chain_1 = None
chain_2 = None
for child in model.get_list():
if child.id == chain_id_1:
chain_1 = child
if child.id == chain_id_2:
chain_2 = child
if chain_1 is None or chain_2 is None:
raise Exception('Chains %s and %s were not found in the model' % (chain_id_1, chain_id_2))
ns = NeighborSearch(list(chain_2.get_atoms()))
interactions_between_chains = OrderedDict()
for idx, residue_1 in enumerate(chain_1):
if residue_1.resname in AMINO_ACIDS and residue_1.id[0] == ' ':
resnum_1 = str(residue_1.id[1]) + residue_1.id[2].strip()
resaa_1 = convert_aa(residue_1.get_resname(), quiet=True)
interacting_residues = set()
for atom_1 in residue_1:
interacting_residues.update(ns.search(atom_1.get_coord(), r_cutoff, 'R'))
interacting_resids = []
for residue_2 in interacting_residues:
resnum_2 = str(residue_2.id[1]) + residue_2.id[2].strip()
resaa_2 = convert_aa(residue_2.get_resname(), quiet=True)
if residue_2.resname in AMINO_ACIDS and residue_2.id[0] == ' ':
interacting_resids.append((resnum_2, resaa_2,))
if interacting_resids:
interacting_resids.sort(
key=lambda x: int(''.join([c for c in x[0] if c.isdigit()])))
interactions_between_chains[(resnum_1, resaa_1)] = interacting_resids
return interactions_between_chains
[docs]def get_interactions_between_chains_slow(model, pdb_chain_1, pdb_chain_2, r_cutoff=5):
"""
Calculate interactions between residues in pdb_chain_1 and pdb_chain_2. An
interaction is defines as a pair of residues where at least one pair of atom
is closer than r_cutoff. The default value for r_cutoff is 5 Angstroms.
.. deprecated:: 1.0
Use :func:`get_interacting_residues` instead.
It gives you both the residue index and the resnum.
"""
# Extract the chains of interest from the model
chain_1 = None
chain_2 = None
for child in model.get_list():
if child.id == pdb_chain_1:
chain_1 = child
if child.id == pdb_chain_2:
chain_2 = child
if chain_1 is None or chain_2 is None:
raise Exception(
'Chains %s and %s were not found in the model' % (pdb_chain_1, pdb_chain_2))
interactions_between_chains = OrderedDict()
for idx, residue_1 in enumerate(chain_1):
if residue_1.resname in AMINO_ACIDS and residue_1.id[0] == ' ':
resnum_1 = str(residue_1.id[1]) + residue_1.id[2].strip()
resaa_1 = convert_aa(residue_1.get_resname())
interacting_resids = []
for residue_2 in chain_2:
resnum_2 = str(residue_2.id[1]) + residue_2.id[2].strip()
resaa_2 = convert_aa(residue_2.get_resname())
r_min = None
if residue_2.resname in AMINO_ACIDS and residue_2.id[0] == ' ':
for atom_1 in residue_1:
for atom_2 in residue_2:
r = calculate_distance(atom_1, atom_2, r_cutoff)
if r is not None:
if r_min and r < r_min:
r_min = r
elif not r_min:
r_min = r
if r_min:
interacting_resids.append((resnum_2, resaa_2, r_min,))
if interacting_resids:
interactions_between_chains[(resnum_1, resaa_1)] = interacting_resids
return interactions_between_chains
# %%
[docs]def chain_is_hetatm(chain):
"""Return True if the chain is made up entirely of HETATMs.
"""
hetatms = [None] * len(chain)
for i in range(len(chain.child_list)):
res = chain.child_list[i]
hetatms[i] = res.resname not in AAA_DICT
if all(hetatms):
return True
elif not any(hetatms):
return False
else:
# Something went wrong.
sequence, numbering = get_chain_sequence_and_numbering(chain)
message = (
'Some but not all residues in chain {} are hetatms!\n'.format(chain.id) +
'sequence: {}\n'.format(sequence) +
'numbering: {}\n'.format(numbering)
)
logger.debug(message)
False
[docs]def get_aa_residues(chain):
aa_residues = [
residue.id for residue in chain
if residue.resname in AAA_DICT and not residue.id[0].split()
]
return aa_residues
[docs]def get_interacting_residues(model, r_cutoff=5, skip_hetatm_chains=True):
"""Returns all interactions between residues on different chains in `model`.
Returns
-------
dict
A dictionary of interactions between chains i (0..n-1) and j (i+1..n).
Keys are (chain_idx, chain_id, residue_idx, residue_resnum, residue_amino_acid) tuples.
(e.g. (0, 'A', 0, '0', 'M'), (0, 1, '2', 'K'), ...)
Values are a list of tuples having the same format as the keys.
You can reverse the order of keys and values like this::
complement = dict()
for key, values in get_interacting_chains(model):
for value in values:
complement.setdefault(value, set()).add(key)
You can get a list of all interacting chains using this command::
{(key[0], value[0])
for (key, values) in get_interacting_chains(model).items()
for value in values}
"""
from Bio.PDB import NeighborSearch
interactions_between_chains = dict()
# Chain 1
for chain_1_idx, chain_1 in enumerate(model):
if skip_hetatm_chains and chain_is_hetatm(chain_1):
message = (
"Skipping chain_1 with idx {} because it contains only hetatms."
.format(chain_1_idx)
)
logger.debug(message)
continue
chain_1_residue_ids = get_aa_residues(chain_1)
# Chain 2
for j, chain_2 in enumerate(model.child_list[chain_1_idx + 1:]):
chain_2_idx = chain_1_idx + 1 + j
if skip_hetatm_chains and chain_is_hetatm(chain_2):
message = (
"Skipping chain_2 with idx {} because it contains only hetatms."
.format(chain_2_idx)
)
logger.debug(message)
continue
chain_2_residue_ids = get_aa_residues(chain_2)
ns = NeighborSearch(list(chain_2.get_atoms()))
# Residue 1
for residue_1 in chain_1:
try:
residue_1_idx = chain_1_residue_ids.index(residue_1.id)
except ValueError:
continue
residue_1_resnum = str(residue_1.id[1]) + residue_1.id[2].strip()
residue_1_aa = convert_aa(residue_1.resname, quiet=True)
residue_1_key = (
chain_1_idx, chain_1.id, residue_1_idx, residue_1_resnum, residue_1_aa
)
interacting_residues = set()
for atom_1 in residue_1:
interacting_residues.update(ns.search(atom_1.get_coord(), r_cutoff, 'R'))
# Residue 2
interacting_residue_ids = []
for residue_2 in interacting_residues:
try:
residue_2_idx = chain_2_residue_ids.index(residue_2.id)
except ValueError:
continue
residue_2_resnum = str(residue_2.id[1]) + residue_2.id[2].strip()
residue_2_aa = convert_aa(residue_2.get_resname(), quiet=True)
residue_2_key = (
chain_2_idx, chain_2.id, residue_2_idx, residue_2_resnum, residue_2_aa
)
interacting_residue_ids.append(residue_2_key)
if interacting_residue_ids:
interactions_between_chains\
.setdefault(residue_1_key, set())\
.update(interacting_residue_ids)
return interactions_between_chains
# %% Additions for `pipeline_structure`
class SelectChains(Select):
"""Only accept the specified chains when saving.
"""
def __init__(self, chain_letters, ns_chain_letters=None, ns=None, r_cutoff=None):
self.chain_letters = chain_letters
self.ns_chain_letters = ns_chain_letters
self.ns = ns
self.r_cutoff = r_cutoff
[docs] def accept_residue(self, residue):
chain_id = residue.parent.id
if chain_id in self.chain_letters:
return True
elif (self.ns_chain_letters and self.ns) and (chain_id in self.ns_chain_letters):
for atom in residue:
if self.ns.search(atom.get_coord(), self.r_cutoff, 'C'):
return True
return False
class StructureParser:
"""
Attributes
----------
pdb_id : ___
domain_boundaries : list of lists of lists
Elements in the outer list correspond to domains in each chain of the
pdb. Elements of the inner list contain the start and end of each
fragment of each domain. For example, if there is only one chain
with pdb domain boundaries 1-10:20-45, this would correspond to
domain_boundaries [[[1,10],[20,45]]].
"""
def __init__(self, pdb_file, chain_ids=None, domain_defs=[]):
"""
Parameters
----------
pdb_file : str
Full path and filename of the structure.
output_dir : str
Folder where to save extracted structures and sequences.
chain_ids : list
Chains of the structure that should be kept.
"""
self.pdb_id = get_pdb_id(pdb_file)
self.pdb_file = pdb_file
self.input_structure = get_pdb_structure(self.pdb_file, self.pdb_id)
if chain_ids is None:
self.chain_ids = [chain.id for chain in self.input_structure[0].child_list]
elif isinstance(chain_ids, str):
self.chain_ids = chain_ids.split(',')
elif isinstance(chain_ids, list) or isinstance(chain_ids, tuple):
self.chain_ids = list(chain_ids)
else:
raise Exception
self.r_cutoff = 6 # remove hetatms more than x A away from the main chain(s)
self.domain_boundaries = []
for domain_def in domain_defs:
self.domain_boundaries.append(
helper.decode_domain_def(domain_def, merge=False, return_string=True)
)
self.unique_id = ('pdb_id: {}, chain_ids: {}'.format(self.pdb_id, self.chain_ids))
[docs] def get_chain_sequence_and_numbering(self, chain_id, *args, **varargs):
"""Call ``get_chain_sequence_and_numbering`` using chain with id ``chain_id``
"""
chain = self.structure[0][chain_id]
return get_chain_sequence_and_numbering(chain, *args, **varargs)
[docs] def get_chain_seqres_sequence(self, chain_id, *args, **varargs):
"""Call ``get_chain_seqres_sequence`` using chain with id ``chain_id``
"""
chain = self.structure[0][chain_id]
return get_chain_seqres_sequence(chain, *args, **varargs)
[docs] def save_structure(self, output_dir='', remove_disordered=False):
if remove_disordered:
self._unset_disordered_flags()
io = PDBIO()
io.set_structure(self.structure)
try:
# Save all chains together
outFile = op.join(output_dir, self.pdb_id + ''.join(self.chain_ids) + '.pdb')
io.save(outFile)
if len(self.chain_ids) > 1:
# Save each chain individually
for chain_id in self.chain_ids:
chain = self.structure[0][chain_id]
if chain_is_hetatm(chain):
continue
outFile = op.join(output_dir, self.pdb_id + chain_id + '.pdb')
atom_list = [atom for atom in self.structure[0][chain_id].get_atoms()]
hetatm_chain_ns = NeighborSearch(atom_list)
select = SelectChains(
chain_id, self.hetatm_chain_id, hetatm_chain_ns, self.r_cutoff)
io.save(outFile, select=select)
if len(self.chain_ids) > 2:
# Save each interacting chain pair.
for chain_ids in self.interacting_chain_ids:
outFile = op.join(output_dir, self.pdb_id + ''.join(chain_ids) + '.pdb')
atom_list = [
atom for atom
in self.structure[0][chain_id].get_atoms()
for chain_id in chain_ids
]
hetatm_chain_ns = NeighborSearch(atom_list)
select = SelectChains(
chain_ids, self.hetatm_chain_id, hetatm_chain_ns, self.r_cutoff)
io.save(outFile, select=select)
except AttributeError as e:
if remove_disordered:
raise(e)
self.save_structure(output_dir=output_dir, remove_disordered=True)
[docs] def save_sequences(self, output_dir=''):
self.chain_numbering_extended_dict = {}
self.chain_sequence_dict = {}
for chain_id in self.chain_ids:
chain_sequence, chain_numbering_extended = (
self.get_chain_sequence_and_numbering(chain_id)
)
self.chain_numbering_extended_dict[chain_id] = chain_numbering_extended
self.chain_sequence_dict[chain_id] = chain_sequence
with open(op.join(output_dir, self.pdb_id + chain_id + '.fasta'), 'w') as f:
f.write('>' + self.pdb_id + chain_id + '\n')
f.write(chain_sequence + '\n')
f.write('\n')
def _get_domain_def_idxs_for_chain(self, chain, chain_idx):
if not self.domain_boundaries or not self.domain_boundaries[chain_idx]:
return None, None, None
__, chain_numbering = get_chain_sequence_and_numbering(chain)
try:
domain_start_idxs, domain_end_idxs = [
tuple(chain_numbering.index(resid) for resid in resids)
for resids in zip(*self.domain_boundaries[chain_idx])]
except Exception as e:
print(str(e))
raise errors.PDBDomainDefsError(self.unique_id)
return chain_numbering, domain_start_idxs, domain_end_idxs
def _correct_methylated_lysines(self, res):
new_resname = 'LYS'
new_resid = (' ', res.id[1], res.id[2])
logger.debug(
'Renaming residue {} {} to {} {}'
.format(res.resname, res.id, new_resname, new_resid))
res.resname = new_resname
res.id = new_resid
atom_idx = 0
while atom_idx < len(res):
atom_id = res.child_list[atom_idx].id
if atom_id not in LYSINE_ATOMS:
logger.debug(
'Removing atom {} from residue {} {}.'.format(atom_id, res.resname, res.id)
)
res.detach_child(atom_id)
else:
atom_idx += 1
def _move_hetatm_to_hetatm_chain(self, chain, hetatm_chain, res, echo=False):
# logger.debug(
# 'Moving hetatm residue {} {} to the hetatm chain'
# .format(res.resname, res.id))
chain.detach_child(res.id)
hetatm_res = res
hetatm_res.id = (hetatm_res.id[0], len(hetatm_chain)+1, hetatm_res.id[2],)
hetatm_chain.add(hetatm_res)
def _residue_outside_domain(
self, chain, chain_numbering, domain_start_idxs, domain_end_idxs, res):
"""Returns `True` if residue ``res`` is outside the domain.
"""
if domain_start_idxs is None or domain_end_idxs is None:
return False
resid = str(res.id[1]) + res.id[2].strip()
resid_idx = chain_numbering.index(resid)
for domain_start_idx, domain_end_idx in zip(domain_start_idxs, domain_end_idxs):
if resid_idx >= domain_start_idx and resid_idx <= domain_end_idx:
# Residue is inside the domain
return False
# Residue is outside the domain
return True
def _remove_distant_hatatms(self, new_model, hetatm_chain):
"""Detach hetatms that are more than ``self.r_cutoff`` away from the main chain(s)
"""
ns = NeighborSearch(list(new_model.get_atoms()))
hetatm_chain.id = [c for c in reversed(helper.uppercase) if c not in self.chain_ids][0]
res_idx = 0
while res_idx < len(hetatm_chain):
res_1 = hetatm_chain.child_list[res_idx]
in_contact = False
for atom_1 in res_1:
interacting_residues = ns.search(atom_1.get_coord(), self.r_cutoff, 'R')
if interacting_residues:
# logger.debug(res_1.id)
# logger.debug(interacting_residues)
in_contact = True
if in_contact:
res_idx += 1
continue
# logger.debug('Detaching child: {}'.format(res_1.id))
hetatm_chain.detach_child(res_1.id)
def _unset_disordered_flags(self):
"""Change atom and residue ``disordered`` flag to `False`.
Otherwise, Biopython may crash when saving the PDB structure.
"""
logger.debug('Setting all residues and atoms marked as disorded to non-disorded')
for m in self.structure:
for c in m:
for r in c:
if r.is_disordered() or r.disordered:
logger.debug(
'Changing disordered_flag on residue {} from {} to 0'
.format(r, r.disordered))
r.disordered = 0
for a in r:
if a.is_disordered() or a.disordered_flag:
logger.debug(
'Changing disordered_flag on atom {} from {} to 0'
.format(a, a.disordered_flag))
a.disordered_flag = 0