Source code for biointerface.core

"""Core module for extracting Protein-nucleic acid interfaces."""

from Bio.PDB.Polypeptide import PPBuilder, Polypeptide
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB.PDBExceptions import PDBConstructionException

from Bio.PDB.Structure import Structure
from Bio.PDB.Model import Model
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
from Bio.PDB.Atom import Atom

import pandas as pd

from PDBNucleicAcids.NucleicAcid import NABuilder, NucleicAcid
from PDBNucleicAcids.NucleicAcid import DSNABuilder, DoubleStrandNucleicAcid
from PDBNucleicAcids.BasePairRules import BasePairRules
from PDBNucleicAcids.BasePairRules import WatsonCrickBasePairRules

import copy

import warnings
from typing import Literal



[docs]
class Interface:
    """
    Class for Protein-nucleic acid interface.

    :param pp_atoms: Protein atoms.
    :type pp_atoms: list[Atom]

    :param na_atoms: Nucleic atoms.
    :type na_atoms: list[Atom]

    :param search_radius: Search radius, measured in Angstrom, within which
        Protein-nucleic acid interactions are found. Default is 4.0.
    :type search_radius: float | int

    :param pp_list: List of proteins found in the entity given
        to the builder. Default is an empty list.
    :type pp_list: list[Polypeptide]

    :param na_list: List of nucleic acids found in the entity given
        to the builder. Default is an empty list.
    :type na_list: list[NucleicAcid]

    :param dsna_list: List of double stranded nucleic acids found in the entity
        given to the builder. Default is an empty list.
    :type dsna_list: list[DoubleStrandNucleicAcid]

    """

    def __init__(
        self,
        pp_atoms: list[Atom],
        na_atoms: list[Atom],
        search_radius: int | float = 4.0,
        pp_list: list[Polypeptide] | None = None,
        na_list: list[NucleicAcid] | None = None,
        dsna_list: list[DoubleStrandNucleicAcid] | None = None,
    ) -> None:
        """
        Compute the contacts and the bound/binding molecules.

        :param pp_atoms: Protein atoms.
        :param na_atoms: Nucleic acid atoms.
        :param search_radius: Search radius used to find contacts.
        :param pp_list: Candidate polypeptides; ``None`` (the default) is
            treated as an empty list.
        :param na_list: Candidate nucleic acids; ``None`` (the default) is
            treated as an empty list.
        :param dsna_list: Candidate double strand nucleic acids; ``None``
            (the default) is treated as an empty list.
        """
        # ``None`` sentinels instead of ``[]`` defaults: a list literal in the
        # signature is a single object shared by every call.
        pp_list = [] if pp_list is None else pp_list
        na_list = [] if na_list is None else na_list
        dsna_list = [] if dsna_list is None else dsna_list

        # save parameters
        self.search_radius = search_radius

        # Contacts come first: the three extractors below go through
        # get_aminoacids() / get_nucleotides(), which read self.contacts.
        self.contacts = self._extract_contacts(
            pp_atoms=pp_atoms, na_atoms=na_atoms
        )
        self.binding_pp_list: list[Polypeptide] = (
            self._extract_binding_proteins(pp_list=pp_list)
        )
        self.bound_na_list: list[NucleicAcid] = (
            self._extract_bound_nucleic_acids(na_list=na_list)
        )
        self.bound_dsna_list: list[DoubleStrandNucleicAcid] = (
            self._extract_bound_double_strands(dsna_list=dsna_list)
        )

    def __repr__(self) -> str:
        """Return string representation of the interface."""
        if self.binding_pp_list and self.bound_na_list:
            pp_chain_ids = [pp[0].parent.id for pp in self.binding_pp_list]
            na_chain_ids = [na.get_chain_id() for na in self.bound_na_list]
        else:
            pp_chain_ids = {aa.parent.id for aa in self.get_aminoacids()}  # type: ignore
            na_chain_ids = {nu.parent.id for nu in self.get_nucleotides()}  # type: ignore
        return f"<Interface protein_chains={','.join(pp_chain_ids)} \
nucleic_chains={','.join(na_chain_ids)} \
contacts={len(self.contacts)} search_radius={self.search_radius}>"

    def _extract_contacts(
        self, pp_atoms: list[Atom], na_atoms: list[Atom]
    ) -> list[tuple[Atom, Atom]]:
        """
        Extract interface contacts. (PRIVATE)

        All atoms are searched together with ``NeighborSearch`` using
        ``self.search_radius``; only pairs made of one nucleic acid atom and
        one protein atom are kept.

        Parameters
        ----------
        pp_atoms : list[Atom]
            Protein atoms.
        na_atoms : list[Atom]
            Nucleic atoms.

        Returns
        -------
        list[tuple[Atom, Atom]]
            List of pairs of atoms, first one is from the nucleic acids,
            second one is from the protein. Empty if no contact is found.

        """
        # Classify atoms by object identity instead of tagging them with a
        # temporary attribute: the input atoms are never mutated, nothing
        # has to be cleaned up on error, and an atom listed twice is
        # harmless. An atom present in both lists counts as nucleic.
        nucleic_ids = {id(atom) for atom in na_atoms}
        proteic_ids = {id(atom) for atom in pp_atoms} - nucleic_ids

        # build list of all atoms, both nucleic acid and protein
        all_atoms = na_atoms + pp_atoms

        # look for all atom pairs within the given radius
        ns = NeighborSearch(all_atoms)
        all_contacts = ns.search_all(self.search_radius)

        if not all_contacts:
            return []

        # Keep only nucleic/protein pairs, which discards intra-protein and
        # intra-nucleic pairs. Pairs already in (nucleic, protein)
        # orientation come first, then the ones that had to be swapped.
        contacts: list[tuple[Atom, Atom]] = [
            (atom1, atom2)
            for atom1, atom2 in all_contacts
            if id(atom1) in nucleic_ids and id(atom2) in proteic_ids
        ]
        contacts += [
            (atom2, atom1)
            for atom1, atom2 in all_contacts
            if id(atom1) in proteic_ids and id(atom2) in nucleic_ids
        ]
        # col 0 contains nucleic atoms
        # col 1 contains protein atoms

        return contacts


[docs]
    def get_atomic_contacts(self) -> list[tuple[Atom, Atom]]:
        """
        Get interface contacts as pairs of atoms.

        Returns
        -------
        list[tuple[Atom, Atom]]
            List of pairs of atoms, first one is from the nucleic acids,
            second one is from the protein.

        """
        return self.contacts



[docs]
    def get_protein_atoms(self) -> list[Atom]:
        """
        Get only protein atoms in the protein-nucleic acid interface.

        Returns
        -------
        list[Atom]
            List of protein atoms in the interface.

        """
        return list({atom_pair[1] for atom_pair in self.contacts})



[docs]
    def get_nucleic_acid_atoms(self) -> list[Atom]:
        """
        Get only nucleic acid atoms in the protein-nucleic acid interface.

        Returns
        -------
        list[Atom]
            List of nucleic acid atoms in the interface.

        """
        return list({atom_pair[0] for atom_pair in self.contacts})



[docs]
    def get_aminoacids(self) -> list[Residue]:
        """
        Get only protein residues in the protein-nucleic acid interface.

        Returns
        -------
        list[Residue]
            List of protein residues in the interface.

        """
        return list(
            {
                atom_pair[1].parent  # type: ignore
                for atom_pair in self.contacts
            }
        )



[docs]
    def get_nucleotides(self) -> list[Residue]:
        """
        Get only nucleic acid residues in the protein-nucleic acid interface.

        Returns
        -------
        list[Residue]
            List of nucleic acid residues in the interface.

        """
        return list(
            {
                atom_pair[0].parent  # type: ignore
                for atom_pair in self.contacts
            }
        )



[docs]
    def as_dataframe(self) -> pd.DataFrame:
        """
        Get all data from the interface, as a dataframe.

        Contains the following data fields:
            Residue hetero field
            Residue number
            Residue insertion code
            Residue name
            Atom name
            Atom alternate location
            Atom element
            Atomic coordinates (x, y, z)
            From both protein and nucleic acid atoms
            Euclidean distance between atom pair in contact

        Returns
        -------
        df : pd.DataFrame
            All data from the interface.

        """
        protein_chain_id: str = self.get_aminoacids()[0].parent.id  # type: ignore

        data = []

        for na_atom, prot_atom in self.contacts:
            prot_res_hetfield = prot_atom.parent.id[0]  # type: ignore
            prot_res_number = prot_atom.parent.id[1]  # type: ignore
            prot_res_icode = prot_atom.parent.id[2]  # type: ignore
            prot_res_name = prot_atom.parent.resname  # type: ignore
            prot_atom_name = prot_atom.name
            prot_atom_altloc = prot_atom.altloc
            prot_atom_element = prot_atom.element
            prot_atom_coord_x = prot_atom.coord[0]
            prot_atom_coord_y = prot_atom.coord[1]
            prot_atom_coord_z = prot_atom.coord[2]

            dna_chain_id = na_atom.parent.parent.id  # type: ignore
            dna_res_hetfield = na_atom.parent.id[0]  # type: ignore
            dna_res_number = na_atom.parent.id[1]  # type: ignore
            dna_res_icode = na_atom.parent.id[2]  # type: ignore
            dna_res_name = na_atom.parent.resname  # type: ignore
            dna_atom_name = na_atom.name
            dna_atom_altloc = na_atom.altloc
            dna_atom_element = na_atom.element
            dna_atom_coord_x = na_atom.coord[0]
            dna_atom_coord_y = na_atom.coord[1]
            dna_atom_coord_z = na_atom.coord[2]

            euclidean_distance = na_atom - prot_atom

            row = (
                protein_chain_id,
                prot_res_hetfield,
                prot_res_number,
                prot_res_icode,
                prot_res_name,
                prot_atom_name,
                prot_atom_altloc,
                prot_atom_element,
                prot_atom_coord_x,
                prot_atom_coord_y,
                prot_atom_coord_z,
                dna_chain_id,
                dna_res_hetfield,
                dna_res_number,
                dna_res_icode,
                dna_res_name,
                dna_atom_name,
                dna_atom_altloc,
                dna_atom_element,
                dna_atom_coord_x,
                dna_atom_coord_y,
                dna_atom_coord_z,
                euclidean_distance,
            )

            data.append(row)

        df = pd.DataFrame(
            data,
            columns=[
                "protein_chain_id",
                "prot_res_hetfield",
                "prot_res_number",
                "prot_res_icode",
                "prot_res_name",
                "prot_atom_name",
                "prot_atom_altloc",
                "prot_atom_element",
                "prot_atom_coord_x",
                "prot_atom_coord_y",
                "prot_atom_coord_z",
                "dna_chain_id",
                "dna_res_hetfield",
                "dna_res_number",
                "dna_res_icode",
                "dna_res_name",
                "dna_atom_name",
                "dna_atom_altloc",
                "dna_atom_element",
                "dna_atom_coord_x",
                "dna_atom_coord_y",
                "dna_atom_coord_z",
                "euclidean_distance",
            ],
        )

        return df



[docs]
    def get_binding_proteins(self) -> list[Polypeptide]:
        """
        Get all nucleic acid binding proteins.

        Returns
        -------
        list[Polypeptide]
            List of nucleic acid binding proteins.

        """
        return self.binding_pp_list



[docs]
    def get_binding_domains(
        self, upstream_pad: int = 0, downstream_pad: int = 0
    ) -> list[Polypeptide]:
        """
        Get nucleic acid binding domains from the binding proteins.

        The output is the binding "gapped" subsequence of the full protein
        found in the structure.

        This method allows for "gaps" of unbound aminoacids inside the
        binding domain, only the aminoacids at the ends are trimmed according
        to being bound to nucleic acids (NAs) or not.

        A visual example of "gaps":
            ```
            Input full protein:          MQMLLNHKPTKFNGAIDERFHWKVIQRISGSEG
            NA-bound:                               ****  **
            Output binding domain:                  FNGAIDER
            ```

        This method is only an inference of the NA-binding domain:
        while the output will likely align with the annotated true domain,
        it'll likely not infer the whole domain. This is because a domain is
        defined by folding properties, while this method is much more naive.
        The optional "padding" on both ends of the binding domain allows
        more leniency on its extent.

        :param upstream_pad: Number of non-binding residues,
            upstream of the first binding residue, to take inside the binding
            domain. Allows some leniency on what is considered a binding
            domain.
        :type upstream_pad: int

        :param downstream_pad: Number of non-binding residues,
            downstream of the last binding residue, to take inside the binding
            domain. Allows some leniency on what is considered a binding
            domain.
        :type downstream_pad: int

        Returns
        -------
        list[Polypeptide]
            List of nucleic acid binding domains. Polypeptides without any
            residue in contact are skipped.

        """
        # Build the lookup set once: membership tests against the residue
        # list would rescan it for every residue of every polypeptide.
        binding_aminoacids = set(self.get_aminoacids())

        binding_domain_list = []
        for pp in self.get_binding_proteins():
            # positions of the residues in contact, in sequence order
            bound_positions = [
                position
                for position, res in enumerate(pp)
                if res in binding_aminoacids
            ]

            # skip polypeptides without any binding residue
            if not bound_positions:
                continue

            # add padding, clamped to the polypeptide boundaries
            # TODO add padding as init parameter, also for nucleic acids
            start = max(bound_positions[0] - upstream_pad, 0)
            end = min(bound_positions[-1] + downstream_pad, len(pp) - 1)

            # finally get binding domain
            binding_domain_list.append(Polypeptide(pp[start : end + 1]))

        return binding_domain_list



[docs]
    def get_bound_nucleic_acids(self) -> list[NucleicAcid]:
        """
        Get all nucleic acids bound by the protein.

        Returns
        -------
        list[NucleicAcid]
            List of nucleic acids bound by the protein.

        """
        return self.bound_na_list



[docs]
    def get_trimmed_nucleic_acids(self) -> list[NucleicAcid]:
        """
        Get all nucleic acids bound by the protein, but trimmed by binding.

        The output nucleic acids (NAs) are subsequences
        of the full NAs found in the structure,
        since proteins might not bind the whole NA.

        This method allows for "gaps" of unbound nucleotides inside the
        NA, only the nucleotides at the ends are trimmed according
        to being protein-bound or not.

        A visual example of "gaps":
            ```
            Input full NA:            GATATACAAGCCA
            Protein-bound:              ****  **
            Output protein-bound NA:    TATACAAG
            ```

        Returns
        -------
        list[NucleicAcid]
            List of nucleic acids bound by the protein, but trimmed by binding.
            Nucleic acids without any nucleotide in contact are skipped.

        """
        # Build the lookup set once instead of scanning a list per residue.
        bound_nucleotides = set(self.get_nucleotides())

        trimmed_na_list = []
        for na in self.get_bound_nucleic_acids():
            # positions of the nucleotides in contact, in sequence order
            # NOTE(review): assumes iterating a NucleicAcid yields residues
            # in the same order as its integer indexing - confirm.
            bound_positions = [
                position
                for position, res in enumerate(na)
                if res in bound_nucleotides
            ]

            # Skip non bound NAs. This must test for "no position found":
            # a truthiness test on the start index would also discard NAs
            # whose first bound nucleotide sits at index 0.
            if not bound_positions:
                continue

            # no padding is applied here, unlike get_binding_domains()
            start, end = bound_positions[0], bound_positions[-1]

            # finally get a substring of the nucleic acid
            trimmed_na_list.append(NucleicAcid(na[start : end + 1]))

        return trimmed_na_list



[docs]
    def get_bound_double_strands(self) -> list[DoubleStrandNucleicAcid]:
        """
        Get all double strand nucleic acids bound by the protein.

        Returns
        -------
        list[DoubleStrandNucleicAcid]
            List of double strand nucleic acids bound by the protein.

        """
        return self.bound_dsna_list



[docs]
    def get_trimmed_double_strands(self) -> list[DoubleStrandNucleicAcid]:
        """
        Get all double-strand nucleic acids bound by the protein,
        but trimmed by binding.

        The output double stranded nucleic acids (DSNAs) are subsequences
        of the full DSNAs found in the structure,
        since proteins usually do not bind the whole DSNA found in a PDB
        structure.

        This method allows for "gaps" of unbound base pairs inside the
        DSNA, only the base pairs at the ends are trimmed according
        to being protein-bound or not.

        A visual example of "gaps":
            ```
            Input full DSNA:            GATATACAAGCCA
                                        |||||||||||||
                                        TGGCTTGTATATC
            Protein-bound:                ****  **
            Output protein-bound DSNA:    TATACAAG
                                          ||||||||
                                          CTTGTATA
            ```

        Returns
        -------
        list[DoubleStrandNucleicAcid]
            List of double stranded nucleic acids bound by the protein,
            but trimmed by binding.

        """
        bound_nucleotides = self.get_nucleotides()
        bound_dsna_list = self.get_bound_double_strands()

        trimmed_dsna_list = []
        for dsna in bound_dsna_list:
            bound_dsna = copy.copy(dsna)
            while (
                len(bound_dsna) > 0
                and bound_dsna[0].i_res not in bound_nucleotides
                and bound_dsna[0].j_res not in bound_nucleotides
            ):
                # if the FIRST base pair isn't bound by protein
                # then discard it and check the next FIRST base pair
                bound_dsna.pop(0)

            while (
                len(bound_dsna) > 0
                and bound_dsna[-1].i_res not in bound_nucleotides
                and bound_dsna[-1].j_res not in bound_nucleotides
            ):
                # if the LAST base pair isn't bound by protein
                # then discard it and check the next LAST base pair
                bound_dsna.pop(-1)

            if len(bound_dsna) > 0:
                # in this case, there is an actual bound DSNA
                trimmed_dsna_list.append(bound_dsna)

                unbound_bps = []
                for bp in bound_dsna:
                    if (
                        bp.i_res not in bound_nucleotides
                        and bp.j_res not in bound_nucleotides
                    ):
                        unbound_bps.append(bp)

                if unbound_bps:
                    warnings.warn(
                        f"There are {len(unbound_bps)} unbound \
base pairs inside {bound_dsna}: {unbound_bps}"
                    )

        return trimmed_dsna_list


    def _extract_binding_proteins(
        self, pp_list: list[Polypeptide]
    ) -> list[Polypeptide]:
        """
        Get all proteins bound by the protein. (PRIVATE)

        :param pp_list: List of proteins found in the entity given
            to the builder.
        :type na_list: list[Polypeptide]

        Returns
        -------
        list[Polypeptide] | None
            Nucleic acids binding proteins.

        """
        binding_aminoacids = set(self.get_aminoacids())

        # add nucleic acid if it has intersection with bound nucleotides
        bound_pp_list = [pp for pp in pp_list if set(pp) & binding_aminoacids]

        return bound_pp_list

    def _extract_bound_nucleic_acids(
        self, na_list: list[NucleicAcid]
    ) -> list[NucleicAcid]:
        """
        Get all nucleic acids bound by the protein. (PRIVATE)

        :param na_list: List of nucleic acids found in the entity given
            to the builder.
        :type na_list: list[NucleicAcid]

        Returns
        -------
        list[NucleicAcid] | None
            List of nucleic acids bound by the protein.

        """
        bound_nucleotides = set(self.get_nucleotides())

        # add nucleic acid if it has intersection with bound nucleotides
        bound_na_list = [na for na in na_list if set(na) & bound_nucleotides]

        return bound_na_list

    def _extract_bound_double_strands(
        self, dsna_list: list[DoubleStrandNucleicAcid]
    ) -> list[DoubleStrandNucleicAcid]:
        """
        Get all double-strand nucleic acids bound by the protein. (PRIVATE)

        :param dsna_list: List of double strand nucleic acids found in the
            entity given to the builder.
        :type dsna_list: list[DoubleStrandNucleicAcid]

        Returns
        -------
        list[DoubleStrandNucleicAcid]
            List of double-strand nucleic acids bound by the protein.

        """
        bound_nucleotides = self.get_nucleotides()

        # add double strand nucleic acid
        # if one of the strand has intersection with bound nucleotides
        bound_dsna_list = [
            dsna
            for dsna in dsna_list
            if set(dsna.get_i_strand()) & set(bound_nucleotides)
            or set(dsna.get_j_strand()) & set(bound_nucleotides)
        ]

        return bound_dsna_list



# def fixed_protein_atoms_number(self, num_atoms) -> None:
#     """Filter contacts by a fixed number of protein atoms."""
#     # cast list into dataframe, ready to be sorted
#     df = pd.DataFrame(self.contacts, columns=["na_atom", "protein_atom"])
#     df["euclidean_distance"] = df.apply(
#         lambda row: row["na_atom"] - row["protein_atom"], axis=1
#     )

#     # aggregate: for each atom, its minimum distance from DSNA
#     agg = df.groupby(["protein_atom"]).min()
#     agg = agg.reset_index()
#     agg = agg.sort_values(by="euclidean_distance", ascending=True)

#     # get closest n atoms to DSNA
#     top_protein_atoms = agg.head(num_atoms)["protein_atom"].tolist()

#     if len(top_protein_atoms) <= num_atoms:
#         raise Exception("Not enough atoms.")

#     # select contacts by top n atoms
#     selected_contacts = [
#         (na_atom, protein_atom)
#         for na_atom, protein_atom in self.contacts
#         if protein_atom in top_protein_atoms
#     ]

#     self.contacts = selected_contacts

# def fixed_na_atoms_number(self, num_atoms) -> None:
#     """Filter contacts by a fixed number of nucleic acid atoms."""
#     # cast list into dataframe, ready to be sorted
#     df = pd.DataFrame(self.contacts, columns=["na_atom", "protein_atom"])
#     df["euclidean_distance"] = df.apply(
#         lambda row: row["na_atom"] - row["protein_atom"], axis=1
#     )

#     # aggregate: for each atom, its minimum distance from DSNA
#     agg = df.groupby(["na_atom"]).min()
#     agg = agg.reset_index()
#     agg = agg.sort_values(by="euclidean_distance", ascending=True)

#     # get closest n atoms to DSNA
#     top_na_atoms = agg.head(num_atoms)["na_atom"].tolist()

#     if len(top_na_atoms) <= num_atoms:
#         raise Exception("Not enough atoms.")

#     # select contacts by top n atoms
#     selected_contacts = [
#         (na_atom, protein_atom)
#         for na_atom, protein_atom in self.contacts
#         if na_atom in top_na_atoms
#     ]

#     self.contacts = selected_contacts



[docs]
class InterfaceBuilder:
    """
    Use atomic distance to find Protein-Nucleic acid interfaces.

    Assuming you *only* want standard nucleotides and amino acids.

    Parameters
    ----------
    search_radius : float | int, optional
        Search radius, measured in Angstrom, within which
        Protein-Nucleic acid interactions are found. Default is 4.0
    pp_builder : PPBuilder, optional
        Polypeptide builder class from Biopython. Default is ``PPBuilder``
        with default parameters.
    na_builder : NABuilder, optional
        Nucleic acid builder class from PDBNucleicAcids.
        Default is ``NABuilder`` with default parameters.
    dsna_builder : DSNABuilder, optional
        Double strand nucleic acid builder class from PDBNucleicAcids.
        Default is ``DSNABuilder`` with default parameters.
    """

    def __init__(
        self,
        search_radius: float | int = 4.0,
        pp_builder: PPBuilder | None = None,
        na_builder: NABuilder | None = None,
        dsna_builder: DSNABuilder | None = None,
    ) -> None:
        """
        Store the search radius and the three builders.

        Any builder left as ``None`` (the default) is replaced by a new
        instance created with default parameters.
        """
        # Builders are created here rather than in the signature: a default
        # written as ``PPBuilder()`` is evaluated once at definition time
        # and would be shared by every InterfaceBuilder instance.
        self.search_radius = search_radius
        self.pp_builder = PPBuilder() if pp_builder is None else pp_builder
        self.na_builder = NABuilder() if na_builder is None else na_builder
        self.dsna_builder = (
            DSNABuilder() if dsna_builder is None else dsna_builder
        )


[docs]
    def build_interfaces(
        self,
        entity: Structure | Model | Chain,
        by: Literal["polypeptide", "chain", "structure"] = "polypeptide",
        standard_aminoacids: bool = True,
        standard_nucleotides: bool = True,
        pairing_rules: BasePairRules = WatsonCrickBasePairRules(),
    ) -> list[Interface]:
        """
        Extract all Protein-Nucleic acid interfaces found in a PDB entity.

        Parameters
        ----------
        entity : L{Structure}, L{Model} or L{Chain}
            Protein-nucleic acid interfaces are searched for in this object.
            L{Structure} is the suggested input.
        by: str, optional
            If 'polypeptide', interfaces are extracted between nucleic acids
            bound by one polypeptide.
            If 'chain', interfaces are extracted between nucleic acids
            bound by one protein chain, composed by one or more polypeptides.
            If 'structure', interfaces are extracted between nucleic acids
            bound by all protein chains present in the structure,
            composed by one or more polypeptides. Most likely several
            polypeptides.
        standard_aminoacids: bool, optional
            Use only standard aminoacids. This is the `aa_only` parameter
            in the ``PPBuilder.build_peptides()`` method. Default is True.
        standard_nucleotides: bool, optional
            Use only standard nucleotides. This parameter is used
            in the ``NABuilder.build_nucleic_acids()`` method and
            in the ``DSNABuilder.build_double_strands()`` method.
            Default is True.
        pairing_rules : optional
            Rules for proper base pairing class instance from PDBNucleicAcids.
            This parameter is used in the
            ``DSNABuilder.build_double_strands()`` method.
            Default is ``WatsonCrickBasePairRules()`` with default parameters.

        Raises
        ------
            PDBConstructionException: In case there is no protein
                in the input entity.
            PDBConstructionException:  In case there is no nucleic acid
                in the input entity.

        Returns
        -------
        list[Interface]
            List of all Protein-Nucleic acid interfaces found in a PDB entity.

        """
        # build nucleic acids
        na_list = self.na_builder.build_nucleic_acids(
            entity=entity, standard_nucleotides=standard_nucleotides
        )

        # build double stranded nucleic acids
        dsna_list = self.dsna_builder.build_double_strands(
            entity=entity,
            standard_nucleotides=standard_nucleotides,
            pairing_rules=pairing_rules,
        )

        # check if there are nucleic acids
        if not na_list:
            raise PDBConstructionException(
                f"No nucleic acids found in the input entity {entity}"
            )

        # get all the atoms from the nucleic acids
        na_atoms = []
        for na in na_list:
            na_atoms.extend(na.get_atoms())
        na_atoms = list(set(na_atoms))

        # build the proteins
        pp_list: list[Polypeptide] = self.pp_builder.build_peptides(
            entity=entity, aa_only=standard_aminoacids
        )

        # check if there are proteins
        if not pp_list:
            raise PDBConstructionException(
                f"No polypeptides found in the input entity {entity}"
            )

        # find chains with multiple polypeptides
        chain2pp: dict[str, list[Polypeptide]] = {}
        for pp in pp_list:
            chain_id: str = pp[0].parent.id
            if chain_id not in chain2pp.keys():
                chain2pp[chain_id] = [pp]
            else:
                chain2pp[chain_id].append(pp)

        chain2count: dict[str, int] = {
            chain_id: len(pps)
            for chain_id, pps in chain2pp.items()
            if len(pps) > 1
        }

        if len(chain2count) > 1:
            warnings.warn(
                f"Found protein chains divided into multiple polypeptides \
in input entity {entity}: {chain2count}. \
Using argument `by='chain' is advised.`"
            )

        # initialize output
        face_list = []

        if by == "polypeptide":
            for pp in pp_list:
                # get all the atoms from the proteins
                pp_atoms = []
                for res in pp:
                    pp_atoms.extend(res.get_atoms())
                pp_atoms = list(set(pp_atoms))

                # actually building
                face = Interface(
                    pp_atoms=pp_atoms,
                    na_atoms=na_atoms,
                    search_radius=self.search_radius,
                    pp_list=pp_list,
                    na_list=na_list,
                    dsna_list=dsna_list,
                )

                if face.contacts:
                    face_list.append(face)

        elif by == "chain":
            pp_list_list = list(chain2pp.values())

            for pp_list in pp_list_list:
                # get all the atoms from the proteins
                pp_atoms = []
                for pp in pp_list:
                    for res in pp:
                        pp_atoms.extend(res.get_atoms())
                pp_atoms = list(set(pp_atoms))

                # actually building
                face = Interface(
                    pp_atoms=pp_atoms,
                    na_atoms=na_atoms,
                    search_radius=self.search_radius,
                    pp_list=pp_list,
                    na_list=na_list,
                    dsna_list=dsna_list,
                )

                if face.contacts:
                    face_list.append(face)

        elif by == "structure":
            # get all the atoms from the proteins
            pp_atoms = []
            for pp in pp_list:
                for res in pp:
                    pp_atoms.extend(res.get_atoms())
            pp_atoms = list(set(pp_atoms))

            # actually building
            face = Interface(
                pp_atoms=pp_atoms,
                na_atoms=na_atoms,
                search_radius=self.search_radius,
                pp_list=pp_list,
                na_list=na_list,
                dsna_list=dsna_list,
            )

            if face.contacts:
                face_list.append(face)

        else:
            msg = 'Argument `by` must be one of ["polypeptide", "chain", \
"structure"]'
            raise TypeError(msg)

        if not face_list:
            warnings.warn(
                f"No Protein-Nucleic acids interfaces found \
from polypeptides {pp_list} and nucleic acids {na_list}, \
with search radius {self.search_radius}, \
in entity {entity}."
            )

        return face_list





[docs]
def concat_polypeptides(pp_list: list[Polypeptide]) -> list[Polypeptide]:
    """
    Concatenate the polypeptides of each chain into a single polypeptide.

    Polypeptides are grouped by the chain id of their first residue. Inside
    each group they are sorted by the residue number of their first residue
    and joined in that order. A warning is issued when the residue numbers
    of a joined polypeptide are not in ascending order.

    The input polypeptides are left untouched: the concatenation is built
    on a shallow copy of the first polypeptide of each chain.

    Parameters
    ----------
    pp_list : list[Polypeptide]
        Polypeptides to concatenate. Each one must be non-empty.

    Returns
    -------
    list[Polypeptide]
        One polypeptide per chain, in order of first appearance of the chain.

    """
    # group polypeptides by chain
    chain2pps: dict[str, list[Polypeptide]] = {}
    for pp in pp_list:
        chain2pps.setdefault(pp[0].parent.id, []).append(pp)

    out = []
    for chain_id, chain_pps in chain2pps.items():
        # reorder list of polypeptides by residue number
        # in ascending order
        ordered = sorted(chain_pps, key=lambda pp: pp[0].id[1])

        # Extend a copy: extending ordered[0] directly would silently grow
        # the caller's own polypeptide object.
        concat_pp = copy.copy(ordered[0])
        for pp in ordered[1:]:
            concat_pp.extend(pp)

        # check if the new polypeptide has sorted residues
        res_ids: list[int] = [res.id[1] for res in concat_pp]
        if res_ids != sorted(res_ids):
            warnings.warn(
                f"Concatenating polypeptides from chain {chain_id}: residues \
of resulting polypeptide {concat_pp} are not in correct order."
            )

        out.append(concat_pp)

    return out
Source code for biointerface.core

BioInterface

Navigation

Related Topics