Source code for aleimi.confgen

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


# TODO
# Toma de un archivo .smi los SMILES, genera la cant de conformeros seleccionados
# e imprime tanto los .sdf con todos los conformeros como .mop para optimizar en MOPAC
# con el nivel de teoria seleccionado.


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import pandas as pd
import os
from typing import List

#      CHECKING geometry degeneracy
# La primera iteracion que corre por i busca que este optimizada la estructura
# La segunda busca que no se geometricamente la misma estructura
#Pueda suceder que alguna idx cumpla que no es geometricamente la misma estructura
#y no la ve el segundo ciclo
#pero el primero cuando llegue a ella la vera y tomara su id
# =============================================================================

[docs]
def confgen(mol:Chem.rdchem.Mol, rdkit_d_RMSD:float, numConfs:int, rdkit_numThreads:float = 0, UFF:bool = False):
    """Generate conformers stochastically for mol and filter them out based on geometrical and energetic criterions.

    Parameters
    ----------
    mol : Chem.rdchem.Mol
        The RDKit molecule for which the stochastic conformations will be generated.
    rdkit_d_RMSD : float
        RMSD to filter out redundant conformations, geometric filter.
    numConfs : int
        Maximum number of conformation to generate
    rdkit_numThreads : float, optional
        Number of CPUs to use, by default 0 (use all)
    UFF : bool, optional
        If True a classical mechanical optimization will be performed, by default False

    Returns
    -------
    tuple
        * RDKit molecule with conformations, the hydrogens are always added.
        * Index of conformer to use.
        * List of classic energies if UFF = True, if not, a list of None values with length, number of conformers.
    """
    molecule = Chem.AddHs(mol)
    ids = AllChem.EmbedMultipleConfs(molecule, numConfs=numConfs, numThreads=rdkit_numThreads)
    if UFF:
        opt = AllChem.UFFOptimizeMoleculeConfs(molecule, numThreads=rdkit_numThreads, ignoreInterfragInteractions=False, maxIters=500) # The result is a list a containing 2-tuples: (not_converged, energy) for each conformer. If not_converged is 0, the minimization for that conformer converged.

        print('Calculando RMSD ...')
        rmsd_reject = set()
        opt_reject = set()
        rmsd_no_reject= set()
        no_opt=0
        ids=list(ids)
        for i in ids:

            if opt[i][0]==1:
                no_opt +=1
                opt_reject.add(ids[i])
            else:
                for idx in ids:
                    if i < idx:
                        RMSD = AllChem.GetConformerRMS(molecule, i, idx, prealigned=True)
                        if RMSD <= rdkit_d_RMSD:
                            if opt[i][1] < opt[idx][1]:
                                rmsd_reject.add(ids[i])
                                rmsd_no_reject.add(ids[idx])
                            else:
                                rmsd_reject.add(ids[idx])
                                rmsd_no_reject.add(ids[i])

        reject = opt_reject.union(rmsd_reject)
        if len(rmsd_reject) !=0:
            print(f'Los confórmeros con Id {rmsd_reject} no se imprimirán en el .mop por ser iguales (Según el error de RMSD = {rdkit_d_RMSD}) a {rmsd_no_reject} respectivamente y menos favorables energéticamente que los últimos mencionados.')
        if len(opt_reject) !=0:
            print(f'Los confórmeros con Id {opt_reject} no se imprimirán en el .mop pues no lograron ser optimizados.')
        to_use = list(set(ids) - reject)
        return molecule, to_use, [item[1] for item in opt]# Here I am creating a list of tuple (molecule, energy)
    else:

        print('Calculando RMSD ...')
        rmsd_reject = set()
        rmsd_no_reject= set()
        ids=list(ids)
        for i in ids:
            for idx in ids:
                if i < idx:
                    RMSD = AllChem.GetConformerRMS(molecule, i, idx, prealigned=True)
                    if RMSD <= rdkit_d_RMSD:
                        rmsd_reject.add(ids[i])
                        rmsd_no_reject.add(ids[idx])


        if len(rmsd_reject) !=0:
            print(f'Los confórmeros con Id {rmsd_reject} no se imprimirán en el .mop por ser iguales (Según el error de RMSD = {rdkit_d_RMSD}) a {rmsd_no_reject} respectivamente y menos favorables energéticamente que los últimos mencionados.')

        to_use = list(set(ids) - rmsd_reject)
        return molecule, to_use, len(ids)*[None] # Because I did not optimize, then I add None


# =============================================================================
#exportar las imagenes de la mejor manera posible. en columnas de 1,2,3,4 o 5
# =============================================================================


[docs]
def makeimg(mols:List[Chem.rdchem.Mol], **keywords):
    """Simple function to create a picture of the input molecules.
    The file ``used_mols.png`` will be createad.

    Parameters
    ----------
    mols : List[Chem.rdchem.Mol]
        list of molecules
    """

    ms = [mol for mol in mols if mol is not None]
    _ = [AllChem.Compute2DCoords(m) for m in ms]

    cant = len(ms)
    if cant==1:
        best=1
    else:
        div = range(2,6)
        tup = [[i - cant%i,i] for i in div]

        for t in tup:
            if t[0] == t[1]:
                t[0] = 0

        minimum_rest = min(tup, key=lambda x: x[0])
        for t in tup:
            if t[0] == minimum_rest[0] and t[1]>minimum_rest[1]:
                minimum_rest = t
        best = minimum_rest[1]

    if 'legends' in keywords:
        legends = keywords['legends']
    else:
        legends = [f'Mol: {x+1}' for x in range(len(ms))]

    img=Draw.MolsToGridImage(ms,molsPerRow=best,subImgSize=(700,700), legends=legends)
    if len(ms) > 1:
        img.save('used_mols.png')
    else:
        img.save(f'{legends[0]}.png')





[docs]
def main(suppl:str, numConfs:int = 10, rdkit_d_RMSD:float = 0.2, UFF:bool = False, rdkit_numThreads:int = 0, mopac_keywords:str =  'PM7 precise ef xyz geo-ok t=3h THREADS = 2') -> List[str]: #EPS=78.4
    """Generate conformers stochastically for the supplied molecules and filter them out based on geometrical and energetic criterions.

    Parameters
    ----------
    suppl : str
        Path for the molecule file. It could be:.pdb, .mol2, .mol, .sdf, .smi. In case of .smi you can define several SMILES.
        and for all of them the configuration will be generated.
        TODO: do the same for sdf files
    numConfs : int, optional
        Maximum number of conformation to generate, by default 10
    rdkit_d_RMSD : float, optional
        RMSD to filter out redundant conformations, geometric filter, by default 0.2
    UFF : bool, optional
        If True a classical mechanical optimization will be performed, by default False
    rdkit_numThreads : int, optional
        Number of CPUs to use, by default 0 (use all)
    mopac_keywords : str, optional
        This is head of a .mop file, basically a definition of the sei-empirical job. Check
        the `MOPAC Documentation <http://openmopac.net/>`_ for more information , by default 'PM7 precise ef xyz geo-ok t=3h THREADS = 2'

    Returns
    -------
    List[str]
        List of names of the used molecules

    Raises
    ------
    ValueError
        If in the .smi there are invalid SMILES
    ValueError
        If the extension of molecule is not: .pdb, .mol2, .mol, .sdf or .smi.
    ValueError
        If the molecule was not understood by RDKit
    """

    name, ext = os.path.splitext(os.path.basename(suppl))
    print('Resumen\n')
    if ext == ".smi":
        with open(suppl, 'rt') as file:
            smiles = file.readlines()

        # =============================================================================
        #    Comprobamos que los SMILES esten bien
        # expuestas con la funcion molfilter y exportamos .sdf
        # =============================================================================
        Errors_mols = []
        for i, item in enumerate(smiles):
            try:
                Chem.MolFromSmiles(item).GetNumAtoms()
            except:
                Errors_mols.append(i+1)
        if len(Errors_mols) !=  0:
            raise ValueError(f'Las moléculas con Ids: {Errors_mols} no son estructuras SMILES correctas para RDKit. Por favor, retirelas del .smi.')


        mols = [(f'conf_{name}_{i+1}', Chem.MolFromSmiles(smile)) for (i,smile) in enumerate(smiles)]
    elif ext == '.pdb':
        mols = [(f"conf_{name}", Chem.MolFromPDBFile(suppl))]
    elif ext == '.mol2':
        mols = [(f"conf_{name}", Chem.MolFromMol2File(suppl))]
    # elif ext == '.mol':
    #     mols = [(f"conf_{name}", Chem.MolFromMolFile(suppl))]
    elif ext in ['.mol', '.sdf']:
        mols = [(f"conf_{name}", Chem.MolFromMolFile(suppl))]
    else:
        raise ValueError(f'The structure must be one of the following extensions: .pdb, .mol2, .mol, .sdf, .smi. {suppl} was provided')

    if None in [mol[1] for mol in mols]:
        raise ValueError(f"{suppl} is not understanded by RDKit")




    # =============================================================================
    # Generamos conformaciones que cumplen con las condiciones de rmsd
    # expuestas con la funcion molfilter y exportamos .sdf
    # =============================================================================

    makeimg([mol[1] for mol in mols], legends = [mol[0] for mol in mols])
    for (i, mol) in enumerate(mols):

        print(f'Se están generando {numConfs} conformaciones para la molécula con Id = {i+1}...')

        molecule, index, opt = confgen(mol[1], rdkit_d_RMSD, numConfs, rdkit_numThreads = rdkit_numThreads, UFF = UFF)
        natoms = molecule.GetNumAtoms()

        sdf_writer = Chem.SDWriter(f"{mol[0]}.sdf")
        for idx in index:
            sdf_writer.write(molecule, confId=idx)
        sdf_writer.close()
        print(f'{mol[0]}:  {len(index)}/{numConfs}')

    # =============================================================================
    #     Generando los .mop
    # =============================================================================


        print(f"Archivos de salida: {mol[0]}.sdf, {mol[0]}.mop")

        # TODO Change this part to work properlly with the attribute _Name of the molecule
        with open(f"{mol[0]}.sdf", 'rt') as file:
            lines = file.readlines()

        final=open(f"{mol[0]}.mop",'w')

        #Toma los valores reales que se usaron para imprimir el .sdf y asi buscar los opt de verdad, pq si se elimino una estructura tengo que tenerlo en cuenta
        cont = 0 #Lo inicializao en -1 pq la cantidad de veces que se encontrara la palabra RDKit coincide con la cantidad con el len(to_use[0])
        #todo esto para que en el .mop me saque el valor de energia optenido en la optimizacion
        for k, line in enumerate(lines):
            if 'RDKit          3D' in line:

                chunk = lines[(k+3):(k+3+(natoms))]
                sliced = []
                for c in chunk:
                    sliced.append(c.split())
                df = pd.DataFrame(sliced)
                to_print = df[[3, 0, 1, 2]]
                final.write(mopac_keywords+'\n')
                try:
                    comments = f"{mol[0]}   E_UFF = {round(float(opt[index[cont]]), 3)}"
                except:
                    comments = f"{mol[0]}   E_UFF = {opt[index[cont]]}"
                final.write(comments+'\n')
                final.write('CELL: %d\n' % (cont+1))
                to_print.to_string(final, header=False, index=False)
                final.write('\n0\n')
                cont += 1
        print('Número de átomos: %d\n'% (natoms))
        final.close()
    mol_names = [mol[0] for mol in mols]
    return mol_names



if __name__ == '__main__':
    pass