Source code for aleimi.boltzmann

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import rmsd
import numpy as np
import pandas as pd
import os
from aleimi import utils



[docs]
def arc_reader(arc: str):
    """This , for sure, will change in the future to a more object-oriented paradigm
    It reads a arc MOPAC file and give a tuple of

    Parameters
    ----------
    arc : str
        The arc file path

    Returns
    -------
    list[tuple]
        A list of tuple sorted by energy. Each tuple contains:
            #. cells: conformer identifier,
            #. HeatsOfFormation_kcalmol: self-explanatory,
            #. CONTAINER__H: Coordinates numpy array of the heavy atoms with shape (Number of heavy atoms, 3),
            #. Class_E: The classic energy calculated during :meth:`aleimi.confgen.main`
    """
    with open(arc, 'rt', encoding='latin-1') as a:
        lines = a.readlines()

    # getting data from arc
    HeatsOfFormation_kcalmol = []
    cells = []
    Class_E = []
    # finding No. of atoms
    for line in lines:
        if 'Empirical Formula' in line:
            natoms = int(line.split()[-2])
            break

    # CONTAINER = []
    CONTAINER__H = []
    # cart = []
    cart__H = []
    # atoms = []
    for i, line in enumerate(lines):

        if 'HEAT OF FORMATION' in line:
            #I am getting the one in kcal/mol
            HeatsOfFormation_kcalmol.append(float(line.split()[-5]))  # HEAT OF FORMATION       =       -180.69875 KCAL/MOL =    -756.04358 KJ/MOL
       
        elif 'Empirical Formula' in line:
            try:
                Class_E.append(float(lines[i+3].split('=')[-1].split()[0]))  # las dos versiones con y sin optimizacion
            except:
                Class_E.append(None)
                    
            cells.append(int(lines[i+4].split(':')[1]))
    
        elif ('FINAL GEOMETRY OBTAINED' in line):
            chunk = lines[i+4:i+4+natoms]
            cart__H = []
            for c in chunk:
                # atoms.append(c.split()[0])
                # cart.append([c.split()[0].strip(), float(c.split()[1]), float(c.split()[3]), float(c.split()[5])])
                if c.split()[0] != 'H':
                    cart__H.append([c.split()[1], c.split()[3], c.split()[5]])   # No estoy tomando los atomos, solamente coordenadas c.split()[0].strip(),
            # CONTAINER.append(np.asarray(pd.DataFrame(cart)))
            CONTAINER__H.append(np.array(cart__H, dtype=np.float64))
    # atoms = (np.asarray(atoms))
    # .... organizing
    paired = list(zip(cells, HeatsOfFormation_kcalmol, CONTAINER__H, Class_E)) # Esto genera un arreglo de tuplas, me une los arreglos
    ORDERED = sorted(paired, key=lambda x: x[1])  #Esto ordena la tupla segun la energia de menor a mayor
    return ORDERED  # , atoms]




[docs]
def out_reader(out):
    """This , for sure, will change in the future to a more object-oriented paradigm
    It reads a out MOPAC file and give a tuple of

    Parameters
    ----------
    out : str
        The arc file path

    Returns
    -------
    list[tuple]
        A list of tuple sorted by energy. Each tuple contains:
            #. cells: conformer identifier,
            #. HeatsOfFormation_kcalmol: self-explanatory,
            #. CONTAINER__H: Coordinates numpy array of the heavy atoms with shape (Number of heavy atoms, 3)
            #. Class_E: The classic energy calculated during :meth:`aleimi.confgen.main`
    """
    f = open(out, 'r')
    chunk = []
    HeatsOfFormation_kcalmol = []
    cells = []
    Class_E = []
    CONTAINER__H = []
    cart__H = []

    # getting data from out                                                       #
    # finding No. of atoms
    while True:
        line = f.readline()
        if "Empirical Formula" in line:
            natoms = int(line.split()[-2])
            break

    f = open(out, 'r')
    while True:
        line = f.readline()
        if len(line) == 0:
            break
               
        if 79*'-' in line:
            while True:
                line = f.readline()
                if (79*'*' in line) or (len(line) == 0):break
                if "E_UFF = " in line:
                    split = line.split('=')[-1]
                    try:
                        Class_E.append(float(split))
                    except:
                        Class_E.append(None)

                elif 'CELL' in line:
                    cells.append(int(line.split(':')[1]))
                    
                elif 'HEAT OF FORMATION' in line:
                    HeatsOfFormation_kcalmol.append(float(line.split()[-5]))
                elif 'CARTESIAN COORDINATES' in line:
                    utils.ignoreLines(f, 1)
                    cont = 0
                    chunk = []
                    while cont < natoms:
                        chunk.append(f.readline())
                        cont += 1
                    cart__H = []
                    for c in chunk:
                        if c.split()[1] != 'H':
                            cart__H.append([c.split()[2], c.split()[3], c.split()[4]])  # No estoy tomando los atomos, solamente coordenadas c.split()[0].strip(),

            CONTAINER__H.append(np.array(cart__H, dtype=np.float64))
    f.close
    # .... organizing
    paired = list(zip(cells, HeatsOfFormation_kcalmol, CONTAINER__H, Class_E))  # Esto genera un arreglo de tuplas, me une los arreglos
    ORDERED = sorted(paired, key=lambda x: x[1])  # Esto ordena la tupla segun la energia de menor a mayor
    return ORDERED




[docs]
def main(file_path: str, Bd_rmsd: float = 1.0, Bd_E: float = 0.0, BOutPath: bool = True) -> pd.DataFrame:
    """It reads the MOPAC output file (arc or out) and create a Boltzmann table

    Parameters
    ----------
    file_path : str
        MOPAC output (.arc or .out). It will be read with :meth:`aleimi.boltzmann.arc_reader` or :meth:`aleimi.boltzmann.out_reader`
        depending on the extension.
    Bd_rmsd : float, optional
        RMSD to filter out redundant conformations, geometric filter, by default 1.0
    Bd_E : float, optional
        Energy difference in kJ to filter out redundant conformations, geometric filter, by default 1.0, by default 0.0
    BOutPath : bool, optional
        Directory to ouput the table {file_name}_boltzmann.csv", by default True

    Returns
    -------
    pd.DataFrame

    A Table with columns:
        #. `cell`: conformer identifier,
        #. `Class_E`: Classic energy from the RDKit optimization. in this example is `NaN` because we did not perform this optimization,
        #. `HeatOfFormation_kcal`: self-explanatory [kcal/mol],
        #. `Emin_Ei`: Difference in energy between the lower and the i-th conformer in [kcal/mol],
        #. `qi__Pi/Pmin__e^(Emin_Ei)/KbT`: Boltzmann factors,
        #. `Fraction_%__100*qi/q`: Occupancy of each conformer,

    Raises
    ------
    ValueError
        _description_
    """
    name, ext = os.path.splitext(os.path.basename(file_path))
    
    if ext == '.arc':
        ordered = (arc_reader(file_path))
    elif ext == '.out':
        ordered = (out_reader(file_path))
    else:
        raise ValueError(f"{file_path} does not have .arc or .out extension. Therefore is not readable by ALEIMI.")

    if Bd_E:
        for i, _ in enumerate(ordered):
            to_trash_degenerated = []
            for idx, _ in enumerate(ordered):
                if i < idx:
                    Ei = ordered[i][1]
                    Eidx = ordered[idx][1]
                    delta = abs(Ei - Eidx)
                    if delta <= Bd_E:
                        # =============================================================
                        #     CHECKING Geometric degeneracy
                        # =============================================================
                        P = ordered[i][2]
                        Q = ordered[idx][2]

                        RMSD = rmsd.kabsch_rmsd(P, Q, translate=True)

                        if RMSD <= Bd_rmsd:
                            # reject identical structure
                            to_trash_degenerated.append(idx)
            # =========================================================================
            #     FOR EACH STRUCTURE, eliminate degenerated and save lot of time
            # =========================================================================
            to_trash_degenerated = sorted(to_trash_degenerated, reverse=True)
            _ = [ordered.pop(x) for x in to_trash_degenerated]

    else:
        for i, x in enumerate(range(len(ordered))):
            to_trash_degenerated = []
            for idx, y in enumerate(range(len(ordered))):
                if i < idx:

                    # =============================================================
                    #     CHECKING Geometric degeneracy
                    # =============================================================
                    P = ordered[i][2]
                    Q = ordered[idx][2]

                    RMSD = rmsd.kabsch_rmsd(P, Q, translate=True)

                    if RMSD <= Bd_rmsd:
                        # reject identical structure and kept the lowest energy (because ordered() is ordered using the enrgy, so idx always will have a grater enrgy)
                        to_trash_degenerated.append(idx)

            # =========================================================================
            #     FOR EACH STRUCTURE, eliminate degenerated and save lot of time
            # =========================================================================
            to_trash_degenerated = sorted(to_trash_degenerated, reverse=True)
            [ordered.pop(x) for x in to_trash_degenerated]

    # =============================================================================
    #      WORKING with UNDEGENERATED. Cambie la manera de calculos los parametros:
    # Me base en: James B. Foresman - Exploring Chemistry With Electronic Structure Methods 3rd edition (2015) pag 182
    # y Mortimer_Physical Chemistry_(3rd.ed.-2008) pag 1045
    # =============================================================================
    Kb = 1.987204259E-3                        # kcal/(mol⋅K)
    T = 298.15                                 # Absolute T (K)
    DF = pd.DataFrame()
    cells = [ordered[i][0] for i, x in enumerate(ordered)]
    Class_E = [ordered[i][3] for i, x in enumerate(ordered)]
    HeatsOfFormation_kcalmol = [ordered[i][1] for i, x in enumerate(ordered)]
    MinHeatsOfFormation_kcalmol = min(HeatsOfFormation_kcalmol)
    relative_kcalmol = [MinHeatsOfFormation_kcalmol - x for x in HeatsOfFormation_kcalmol]
    qi = [np.exp(E_r/(Kb*T)) for E_r in relative_kcalmol]
    q = sum(qi)
    Fraction = [100*i/q for i in qi]
    # Z = [np.e**(-(E/(k*T))) for E in energy_kcal] #no pudo calcular Z: verflowError: (34, 'Result too large')
    # Pi_b = [(np.e**-(E/(k*T)))/Z for E in energy_kcal]
    # =============================================================================
    #     DATAFRAME
    # =============================================================================
    DF['cell'] = cells
    DF['Class_E'] = Class_E
    DF['HeatOfFormation_kcal/mol'] = HeatsOfFormation_kcalmol
    DF['Emin_Ei'] = relative_kcalmol
    DF['qi__Pi/Pmin__e^(Emin_Ei)/KbT'] = qi
    DF['Fraction_%__100*qi/q'] = Fraction

    if BOutPath:
        with open(f"{name}_boltzmann.csv", 'wt') as rp:
            DF.to_csv(rp)

    return DF



if __name__ == '__main__':
    pass