Source code for aleimi.confgen

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


# TODO
# Toma de un archivo .smi los SMILES, genera la cant de conformeros seleccionados
# e imprime tanto los .sdf con todos los conformeros como .mop para optimizar en MOPAC
# con el nivel de teoria seleccionado.


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import pandas as pd
import os
from typing import List

#      CHECKING geometry degeneracy
# La primera iteracion que corre por i busca que este optimizada la estructura
# La segunda busca que no se geometricamente la misma estructura
#Pueda suceder que alguna idx cumpla que no es geometricamente la misma estructura
#y no la ve el segundo ciclo
#pero el primero cuando llegue a ella la vera y tomara su id
# =============================================================================
[docs] def confgen(mol:Chem.rdchem.Mol, rdkit_d_RMSD:float, numConfs:int, rdkit_numThreads:float = 0, UFF:bool = False): """Generate conformers stochastically for mol and filter them out based on geometrical and energetic criterions. Parameters ---------- mol : Chem.rdchem.Mol The RDKit molecule for which the stochastic conformations will be generated. rdkit_d_RMSD : float RMSD to filter out redundant conformations, geometric filter. numConfs : int Maximum number of conformation to generate rdkit_numThreads : float, optional Number of CPUs to use, by default 0 (use all) UFF : bool, optional If True a classical mechanical optimization will be performed, by default False Returns ------- tuple * RDKit molecule with conformations, the hydrogens are always added. * Index of conformer to use. * List of classic energies if UFF = True, if not, a list of None values with length, number of conformers. """ molecule = Chem.AddHs(mol) ids = AllChem.EmbedMultipleConfs(molecule, numConfs=numConfs, numThreads=rdkit_numThreads) if UFF: opt = AllChem.UFFOptimizeMoleculeConfs(molecule, numThreads=rdkit_numThreads, ignoreInterfragInteractions=False, maxIters=500) # The result is a list a containing 2-tuples: (not_converged, energy) for each conformer. If not_converged is 0, the minimization for that conformer converged. print('Calculando RMSD ...') rmsd_reject = set() opt_reject = set() rmsd_no_reject= set() no_opt=0 ids=list(ids) for i in ids: if opt[i][0]==1: no_opt +=1 opt_reject.add(ids[i]) else: for idx in ids: if i < idx: RMSD = AllChem.GetConformerRMS(molecule, i, idx, prealigned=True) if RMSD <= rdkit_d_RMSD: if opt[i][1] < opt[idx][1]: rmsd_reject.add(ids[i]) rmsd_no_reject.add(ids[idx]) else: rmsd_reject.add(ids[idx]) rmsd_no_reject.add(ids[i]) reject = opt_reject.union(rmsd_reject) if len(rmsd_reject) !=0: print(f'Los confórmeros con Id {rmsd_reject} no se imprimirán en el .mop por ser iguales (Según el error de RMSD = {rdkit_d_RMSD}) a {rmsd_no_reject} respectivamente y menos favorables energéticamente que los últimos mencionados.') if len(opt_reject) !=0: print(f'Los confórmeros con Id {opt_reject} no se imprimirán en el .mop pues no lograron ser optimizados.') to_use = list(set(ids) - reject) return molecule, to_use, [item[1] for item in opt]# Here I am creating a list of tuple (molecule, energy) else: print('Calculando RMSD ...') rmsd_reject = set() rmsd_no_reject= set() ids=list(ids) for i in ids: for idx in ids: if i < idx: RMSD = AllChem.GetConformerRMS(molecule, i, idx, prealigned=True) if RMSD <= rdkit_d_RMSD: rmsd_reject.add(ids[i]) rmsd_no_reject.add(ids[idx]) if len(rmsd_reject) !=0: print(f'Los confórmeros con Id {rmsd_reject} no se imprimirán en el .mop por ser iguales (Según el error de RMSD = {rdkit_d_RMSD}) a {rmsd_no_reject} respectivamente y menos favorables energéticamente que los últimos mencionados.') to_use = list(set(ids) - rmsd_reject) return molecule, to_use, len(ids)*[None] # Because I did not optimize, then I add None
# ============================================================================= #exportar las imagenes de la mejor manera posible. en columnas de 1,2,3,4 o 5 # =============================================================================
[docs] def makeimg(mols:List[Chem.rdchem.Mol], **keywords): """Simple function to create a picture of the input molecules. The file ``used_mols.png`` will be createad. Parameters ---------- mols : List[Chem.rdchem.Mol] list of molecules """ ms = [mol for mol in mols if mol is not None] _ = [AllChem.Compute2DCoords(m) for m in ms] cant = len(ms) if cant==1: best=1 else: div = range(2,6) tup = [[i - cant%i,i] for i in div] for t in tup: if t[0] == t[1]: t[0] = 0 minimum_rest = min(tup, key=lambda x: x[0]) for t in tup: if t[0] == minimum_rest[0] and t[1]>minimum_rest[1]: minimum_rest = t best = minimum_rest[1] if 'legends' in keywords: legends = keywords['legends'] else: legends = [f'Mol: {x+1}' for x in range(len(ms))] img=Draw.MolsToGridImage(ms,molsPerRow=best,subImgSize=(700,700), legends=legends) if len(ms) > 1: img.save('used_mols.png') else: img.save(f'{legends[0]}.png')
[docs] def main(suppl:str, numConfs:int = 10, rdkit_d_RMSD:float = 0.2, UFF:bool = False, rdkit_numThreads:int = 0, mopac_keywords:str = 'PM7 precise ef xyz geo-ok t=3h THREADS = 2') -> List[str]: #EPS=78.4 """Generate conformers stochastically for the supplied molecules and filter them out based on geometrical and energetic criterions. Parameters ---------- suppl : str Path for the molecule file. It could be:.pdb, .mol2, .mol, .sdf, .smi. In case of .smi you can define several SMILES. and for all of them the configuration will be generated. TODO: do the same for sdf files numConfs : int, optional Maximum number of conformation to generate, by default 10 rdkit_d_RMSD : float, optional RMSD to filter out redundant conformations, geometric filter, by default 0.2 UFF : bool, optional If True a classical mechanical optimization will be performed, by default False rdkit_numThreads : int, optional Number of CPUs to use, by default 0 (use all) mopac_keywords : str, optional This is head of a .mop file, basically a definition of the sei-empirical job. Check the `MOPAC Documentation <http://openmopac.net/>`_ for more information , by default 'PM7 precise ef xyz geo-ok t=3h THREADS = 2' Returns ------- List[str] List of names of the used molecules Raises ------ ValueError If in the .smi there are invalid SMILES ValueError If the extension of molecule is not: .pdb, .mol2, .mol, .sdf or .smi. ValueError If the molecule was not understood by RDKit """ name, ext = os.path.splitext(os.path.basename(suppl)) print('Resumen\n') if ext == ".smi": with open(suppl, 'rt') as file: smiles = file.readlines() # ============================================================================= # Comprobamos que los SMILES esten bien # expuestas con la funcion molfilter y exportamos .sdf # ============================================================================= Errors_mols = [] for i, item in enumerate(smiles): try: Chem.MolFromSmiles(item).GetNumAtoms() except: Errors_mols.append(i+1) if len(Errors_mols) != 0: raise ValueError(f'Las moléculas con Ids: {Errors_mols} no son estructuras SMILES correctas para RDKit. Por favor, retirelas del .smi.') mols = [(f'conf_{name}_{i+1}', Chem.MolFromSmiles(smile)) for (i,smile) in enumerate(smiles)] elif ext == '.pdb': mols = [(f"conf_{name}", Chem.MolFromPDBFile(suppl))] elif ext == '.mol2': mols = [(f"conf_{name}", Chem.MolFromMol2File(suppl))] # elif ext == '.mol': # mols = [(f"conf_{name}", Chem.MolFromMolFile(suppl))] elif ext in ['.mol', '.sdf']: mols = [(f"conf_{name}", Chem.MolFromMolFile(suppl))] else: raise ValueError(f'The structure must be one of the following extensions: .pdb, .mol2, .mol, .sdf, .smi. {suppl} was provided') if None in [mol[1] for mol in mols]: raise ValueError(f"{suppl} is not understanded by RDKit") # ============================================================================= # Generamos conformaciones que cumplen con las condiciones de rmsd # expuestas con la funcion molfilter y exportamos .sdf # ============================================================================= makeimg([mol[1] for mol in mols], legends = [mol[0] for mol in mols]) for (i, mol) in enumerate(mols): print(f'Se están generando {numConfs} conformaciones para la molécula con Id = {i+1}...') molecule, index, opt = confgen(mol[1], rdkit_d_RMSD, numConfs, rdkit_numThreads = rdkit_numThreads, UFF = UFF) natoms = molecule.GetNumAtoms() sdf_writer = Chem.SDWriter(f"{mol[0]}.sdf") for idx in index: sdf_writer.write(molecule, confId=idx) sdf_writer.close() print(f'{mol[0]}: {len(index)}/{numConfs}') # ============================================================================= # Generando los .mop # ============================================================================= print(f"Archivos de salida: {mol[0]}.sdf, {mol[0]}.mop") # TODO Change this part to work properlly with the attribute _Name of the molecule with open(f"{mol[0]}.sdf", 'rt') as file: lines = file.readlines() final=open(f"{mol[0]}.mop",'w') #Toma los valores reales que se usaron para imprimir el .sdf y asi buscar los opt de verdad, pq si se elimino una estructura tengo que tenerlo en cuenta cont = 0 #Lo inicializao en -1 pq la cantidad de veces que se encontrara la palabra RDKit coincide con la cantidad con el len(to_use[0]) #todo esto para que en el .mop me saque el valor de energia optenido en la optimizacion for k, line in enumerate(lines): if 'RDKit 3D' in line: chunk = lines[(k+3):(k+3+(natoms))] sliced = [] for c in chunk: sliced.append(c.split()) df = pd.DataFrame(sliced) to_print = df[[3, 0, 1, 2]] final.write(mopac_keywords+'\n') try: comments = f"{mol[0]} E_UFF = {round(float(opt[index[cont]]), 3)}" except: comments = f"{mol[0]} E_UFF = {opt[index[cont]]}" final.write(comments+'\n') final.write('CELL: %d\n' % (cont+1)) to_print.to_string(final, header=False, index=False) final.write('\n0\n') cont += 1 print('Número de átomos: %d\n'% (natoms)) final.close() mol_names = [mol[0] for mol in mols] return mol_names
if __name__ == '__main__': pass