Source code for biointerface.core

"""Core module for extracting Protein-DNA interfaces."""

from Bio.PDB.Polypeptide import PPBuilder
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB.Residue import Residue
from Bio.PDB.Atom import Atom
from Bio.PDB.PDBExceptions import PDBConstructionException

# from Bio.PDB.Structure import Structure
# from Bio.PDB.Model import Model
# from Bio.PDB.Chain import Chain
# from Bio.PDB import MMCIFIO

import pandas as pd

from PDBNucleicAcids.NucleicAcid import (
    NABuilder,
    DSNABuilder,
    DoubleStrandNucleicAcid,
)

import copy

import warnings


def build_interfaces(structure, search_radius=5.0) -> list:
    """
    Extract all Protein-DNA interfaces found in a structure.

    One :class:`Interface` is built for every chain that yields at least one
    polypeptide, provided the structure also contains at least one nucleic
    acid. Interfaces are returned in the order in which their protein chain
    is first encountered, so the result is reproducible between runs.

    Parameters
    ----------
    structure : Bio.PDB.Structure
        Biopython Structure entity.
    search_radius : float | int, optional
        Search radius, measured in angstroms (Å), within which Protein-DNA
        interactions are found. Default is 5.0

    Returns
    -------
    list
        List of all Protein-DNA interfaces found in a structure. Empty when
        the structure holds no nucleic acid or no polypeptide.

    """
    # Without nucleic acids there is nothing for a protein to be in contact
    # with, so bail out before doing any per-chain work.
    if not NABuilder().build_nucleic_acids(structure):
        return []

    pp_list = PPBuilder().build_peptides(structure)
    if not pp_list:
        return []

    # The same chain id can show up for more than one polypeptide, hence the
    # de-duplication. dict.fromkeys keeps first-seen order, whereas iterating
    # a set of strings gives an order that changes from run to run.
    prot_chain_ids = dict.fromkeys(pp[0].parent.id for pp in pp_list)

    return [
        Interface(
            structure=structure,
            protein_chain_id=prot_chain_id,
            search_radius=search_radius,
        )
        for prot_chain_id in prot_chain_ids
    ]
class Interface:
    """
    Extract Protein-DNA interface.

    The contacts are computed once, at construction time, and stored in
    ``self.contacts`` as ``(dna_atom, protein_atom)`` pairs. All the
    ``get_*`` methods are views derived from that list.

    Parameters
    ----------
    structure : Bio.PDB.Structure
        Biopython Structure entity.
    protein_chain_id : str
        Chain id of a protein that may interact with DNA.
    search_radius : float | int, optional
        Search radius, measured in angstroms (Å), within which Protein-DNA
        interactions are found. Default is 5.0

    Raises
    ------
    PDBConstructionException
        In case of `protein_chain_id` not being a protein chain.

    """

    # Annotations that name third-party types are written as strings so that
    # they are not evaluated when the class body is executed.

    # Per-atom column suffixes used by `get_interface_data`; each one is
    # emitted twice, prefixed with "prot_" and with "dna_".
    _ATOM_COLUMNS = (
        "res_hetfield",
        "res_number",
        "res_icode",
        "res_name",
        "atom_name",
        "atom_altloc",
        "atom_element",
        "atom_coord_x",
        "atom_coord_y",
        "atom_coord_z",
    )

    def __init__(self, structure, protein_chain_id, search_radius=5.0) -> None:
        self.structure = structure
        self.protein_chain_id = protein_chain_id
        self.search_radius = search_radius

        self.contacts = self._extract_contacts()

        # Sorted so that repr() does not depend on set iteration order.
        self._dna_chain_ids = sorted(
            {atom.parent.parent.id for atom in self.get_dna_atoms()}
        )

    def __repr__(self):
        """Return string representation of the interface."""
        # Implicit concatenation instead of backslash continuations, which
        # would pull any source indentation into the resulting string.
        return (
            f"<Interface chains={self.protein_chain_id}:"
            f"{''.join(self._dna_chain_ids)} "
            f"contacts={len(self.contacts)} "
            f"search_radius={self.search_radius}>"
        )

    def _extract_contacts(self) -> "list[tuple[Atom, Atom]]":
        """
        Extract interface contacts (PRIVATE).

        Raises
        ------
        PDBConstructionException
            In case of `protein_chain_id` not being a protein chain.

        Returns
        -------
        list[tuple[Atom, Atom]]
            List of pairs of atoms, first one is DNA, second is proteic.

        """
        # Collect every atom of the nucleic acids (in most cases DNA).
        na_list = NABuilder().build_nucleic_acids(self.structure)
        na_atoms = []
        for na in na_list:
            na_atoms.extend(na.get_atoms())
        na_atoms = list(set(na_atoms))
        na_chain_ids = {na.get_chain_id() for na in na_list}

        # Collect every atom of the requested protein chain; the chain is
        # looked up in the model with id 0.
        protein_chain = self.structure[0][self.protein_chain_id]
        pp_list = PPBuilder().build_peptides(protein_chain)

        # Check that the given chain id is actually a protein.
        if not pp_list:
            raise PDBConstructionException(
                "No polypeptides found in the input protein "
                f"chain id: {self.protein_chain_id}"
            )

        pp_atoms = []
        for pp in pp_list:
            for res in pp:
                pp_atoms.extend(res.get_atoms())
        pp_atoms = list(set(pp_atoms))

        # Pool nucleic-acid and protein atoms, leaving hydrogens out.
        heavy_atoms = [
            atom for atom in na_atoms + pp_atoms if atom.element != "H"
        ]

        # Find every pair of pooled atoms closer than `search_radius`.
        all_contacts = NeighborSearch(heavy_atoms).search_all(
            self.search_radius
        )

        # Keep only the DNA-protein pairs (dropping intra-DNA and
        # intra-protein ones) and make sure the DNA atom always comes first,
        # swapping the pair when the search returned it the other way round.
        dna_first = []
        protein_first = []
        for atom1, atom2 in all_contacts:
            chain1 = atom1.parent.parent.id
            chain2 = atom2.parent.parent.id
            if chain1 in na_chain_ids and chain2 == self.protein_chain_id:
                dna_first.append((atom1, atom2))
            if chain1 == self.protein_chain_id and chain2 in na_chain_ids:
                protein_first.append((atom2, atom1))

        return dna_first + protein_first

    def get_atomic_contacts(self) -> "list[tuple[Atom, Atom]]":
        """
        Get interface contacts as pairs of atoms.

        Returns
        -------
        list[tuple[Atom, Atom]]
            List of pairs of atoms, first one is DNA, second is proteic.

        """
        return self.contacts

    def get_protein_atoms(self) -> "list[Atom]":
        """
        Get only protein atoms in the protein-DNA interface.

        Returns
        -------
        list[Atom]
            List of protein atoms in the interface, without duplicates and
            in no particular order.

        """
        return list({prot_atom for _, prot_atom in self.contacts})

    def get_dna_atoms(self) -> "list[Atom]":
        """
        Get only DNA atoms in the protein-DNA interface.

        Returns
        -------
        list[Atom]
            List of DNA atoms in the interface, without duplicates and in no
            particular order.

        """
        return list({na_atom for na_atom, _ in self.contacts})

    def get_aminoacids(self) -> "list[Residue]":
        """
        Get only protein residues in the protein-DNA interface.

        Returns
        -------
        list[Residue]
            List of protein residues in the interface, without duplicates
            and in no particular order.

        """
        return list({prot_atom.parent for _, prot_atom in self.contacts})

    def get_nucleotides(self) -> "list[Residue]":
        """
        Get only DNA residues in the protein-DNA interface.

        Returns
        -------
        list[Residue]
            List of DNA residues in the interface, without duplicates and in
            no particular order.

        """
        return list({na_atom.parent for na_atom, _ in self.contacts})

    @staticmethod
    def _atom_fields(atom) -> tuple:
        """Return the per-atom values, ordered as `_ATOM_COLUMNS` (PRIVATE)."""
        hetfield, number, icode = atom.parent.id
        return (
            hetfield,
            number,
            icode,
            atom.parent.resname,
            atom.name,
            atom.altloc,
            atom.element,
            atom.coord[0],
            atom.coord[1],
            atom.coord[2],
        )

    def get_interface_data(self) -> "pd.DataFrame":
        """
        Get all data from the interface, as a dataframe.

        One row per contact. Contains the following data fields, for both
        the protein atom (``prot_`` prefix) and the DNA atom (``dna_``
        prefix):

        - Residue hetero field
        - Residue number
        - Residue insertion code
        - Residue name
        - Atom name
        - Atom alternate location
        - Atom element
        - Atomic coordinates (x, y, z)

        plus the chain ids (``protein_chain_id``, ``dna_chain_id``) and the
        Euclidean distance between the two atoms (``euclidian_distance``).

        Returns
        -------
        df : pd.DataFrame
            All data from the interface.

        """
        rows = [
            (
                self.protein_chain_id,
                *self._atom_fields(prot_atom),
                na_atom.parent.parent.id,
                *self._atom_fields(na_atom),
                # subtracting two atoms yields their distance
                na_atom - prot_atom,
            )
            for na_atom, prot_atom in self.contacts
        ]
        columns = [
            "protein_chain_id",
            *(f"prot_{name}" for name in self._ATOM_COLUMNS),
            "dna_chain_id",
            *(f"dna_{name}" for name in self._ATOM_COLUMNS),
            "euclidian_distance",
        ]
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def _is_bound_base_pair(bp, bound_nucleotides) -> bool:
        """Tell whether either residue of a base pair is bound (PRIVATE)."""
        return bp.i_res in bound_nucleotides or bp.j_res in bound_nucleotides

    @staticmethod
    def _trim_unbound_ends(dsna, bound_nucleotides):
        """
        Pop unbound base pairs from both ends of `dsna`, in place (PRIVATE).

        Trimming stops as soon as the container is empty, so a double strand
        without any bound base pair comes back empty instead of raising
        ``IndexError``.
        """
        is_bound = Interface._is_bound_base_pair
        # Discard the FIRST base pair for as long as it is not protein-bound.
        while len(dsna) > 0 and not is_bound(dsna[0], bound_nucleotides):
            dsna.pop(0)
        # Discard the LAST base pair for as long as it is not protein-bound.
        while len(dsna) > 0 and not is_bound(dsna[-1], bound_nucleotides):
            dsna.pop(-1)
        return dsna

    def get_bound_double_strands(self) -> "list[DoubleStrandNucleicAcid]":
        """
        Get all double-strand nucleic acids bound by the protein.

        The output double stranded nucleic acids (DSNAs) are subsequences of
        the full DSNAs found in the structure, since proteins usually do not
        bind the whole DSNA. DSNAs with no protein-bound base pair at all
        are left out of the result.

        This method allows for "gaps" of unbound base-pairs inside the DSNA,
        only the base pairs at the ends are trimmed according to being
        protein-bound or not. A warning is issued for every unbound base
        pair left inside a returned DSNA.

        A visual example of "gaps":

        ```
        Input full DSNA:            GATATACAAGCCA
        Protein-bound:                ****  **
        Output protein-bound DSNA:    TATACAAG
        ```

        Returns
        -------
        bound_dsna_list : list[DoubleStrandNucleicAcid]
            List of double-strand nucleic acids bound by the protein.

        """
        bound_nucleotides = self.get_nucleotides()
        dsna_list = DSNABuilder().build_double_strands(self.structure)

        bound_dsna_list = []
        for dsna in dsna_list:
            # Trim a copy, as in the original code, rather than the object
            # handed back by the builder.
            bound_dsna = self._trim_unbound_ends(
                copy.copy(dsna), bound_nucleotides
            )
            if len(bound_dsna) == 0:
                # Nothing of this double strand is bound by the protein.
                continue

            bound_dsna_list.append(bound_dsna)
            for bp in bound_dsna:
                if not self._is_bound_base_pair(bp, bound_nucleotides):
                    warnings.warn(
                        "Warning: there are unbound base-pairs "
                        "inside the resulting DoubleStrandNucleicAcid "
                        f"- {bp}",
                        stacklevel=2,
                    )

        return bound_dsna_list
# def export_atom_list(structure_id, atom_list, out_filepath): # """Export atom list.""" # # not in Path but in string # out_filepath = str(out_filepath) # new_structure = Structure.Structure(structure_id) # for atom in atom_list: # _add_atom_to_new_structure(atom, new_structure) # # Prepare IO object # io = MMCIFIO() # io.set_structure(new_structure) # # Esporta la nuova struttura in un file PDB # # necessita di string type filepath # io.save(out_filepath) # def _add_atom_to_new_structure(atom, new_structure): # model_id = ( # atom.get_parent().get_parent().get_parent().id # ) # Ottieni l'ID del modello # chain_id = atom.get_parent().get_parent().id # Ottieni l'ID della catena # residue_id = atom.get_parent().id # Ottieni l'ID del residuo # resname = atom.get_parent().resname # Nome del residuo # # Controlla se il modello esiste già nel nuovo modello # if model_id in [model.id for model in new_structure]: # new_model = new_structure[model_id] # else: # new_model = Model.Model(model_id) # new_structure.add(new_model) # # Controlla se la catena esiste già nel nuovo modello # if chain_id in [chain.id for chain in new_model]: # new_chain = new_model[chain_id] # else: # new_chain = Chain.Chain(chain_id) # new_model.add(new_chain) # # Controlla se il residuo esiste già nella nuova catena # if residue_id in [res.id for res in new_chain]: # new_residue = new_chain[residue_id] # else: # new_residue = Residue.Residue( # residue_id, resname, atom.get_parent().segid # ) # new_chain.add(new_residue) # # Copia l'atomo e aggiungilo al residuo # new_atom = atom.copy() # new_residue.add(new_atom)