# Source code for aleimi.extractor

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import os
from aleimi import templates, utils
from typing import List

# def extract(arc_file, boltzmann_file, energy_cut = 2, conformer_cut = None, mksh = True, mkdir = True):
# Energy cut is in kcal/mol



# [docs]
def extract(boltzmann_file: str, energy_cut: float = 2.0, conformer_cut: int = None) -> List[int]:
    """Select the conformers that pass the ``energy_cut`` and/or ``conformer_cut`` filters.

    The two filters are combined as a union: a conformer is selected when it
    passes at least one of the active filters. A filter is inactive when its
    value is falsy (``None`` or ``0``). When both filters are inactive every
    conformer of the file is selected.

    Parameters
    ----------
    boltzmann_file : str
        The boltzmann file generated with :meth:`aleimi.boltzmann.main`. It is
        read with :func:`pandas.read_csv` and must provide the columns
        ``cell`` and ``Emin_Ei``.
    energy_cut : float, optional
        Maximum difference in energy with respect to the conformer with the
        lowest energy; rows with ``Emin_Ei >= -energy_cut`` are kept,
        by default 2.0
    conformer_cut : int, optional
        Number of leading rows of the file to select, by default None

    Returns
    -------
    List[int]
        The sorted list of ``cell`` identifiers, without repetitions.
    """
    df = pd.read_csv(boltzmann_file)

    # A set is used so that a conformer selected by both filters is reported
    # only once.
    selected = set()

    if energy_cut:
        selected.update(df[df.Emin_Ei >= -energy_cut].cell.tolist())

    if conformer_cut:
        selected.update(df.iloc[:conformer_cut].cell.tolist())

    if not energy_cut and not conformer_cut:
        # No filter requested: take every conformer.
        selected.update(df.cell.tolist())

    return sorted(selected)



def _read_natoms(lines):
    """Return the atom count from the first 'Empirical Formula' line, or None if absent."""
    for line in lines:
        if 'Empirical Formula' in line:
            # The count is the second-to-last whitespace-separated token.
            return int(line.split()[-2])
    return None


def get_coords(input_file, indx_to_extract):
    """Read the geometries of the selected cells from an ``.arc`` or ``.out`` file.

    Parameters
    ----------
    input_file : str
        Path to the file to parse. The text after the last ``.`` of the base
        name must be ``arc`` or ``out``.
    indx_to_extract : iterable of int
        Cell identifiers whose geometry should be returned.

    Returns
    -------
    list of tuple
        One ``(name, coords)`` tuple per extracted geometry, in file order.
        ``name`` is ``"<file name without extension>_Cell_<cell>"`` and
        ``coords`` is a :class:`pandas.DataFrame` of string tokens:
        columns ``[0, 1, 3, 5]`` of the whitespace-split atom lines for
        ``.arc`` files and columns ``[1, 2, 3, 4]`` for ``.out`` files
        (presumably the element symbol followed by x, y, z -- confirm against
        the file format).

    Raises
    ------
    ValueError
        If the extension is neither ``arc`` nor ``out``, or if the number of
        atoms is needed but no 'Empirical Formula' line is present.
    """
    file_name, _, file_ext = os.path.basename(input_file).rpartition('.')

    if file_ext not in ('arc', 'out'):
        raise ValueError(f"{input_file} does not have .arc or .out extension. Therefore is not readeable by ALEIMI.")

    wanted = set(indx_to_extract)
    missing_formula = (
        f"'Empirical Formula' line not found in {input_file}; "
        "the number of atoms can not be determined."
    )

    if file_ext == 'arc':
        with open(input_file, 'rt', encoding='latin-1') as file:
            lines = file.readlines()

        natoms = _read_natoms(lines)
        to_return = []
        for i, line in enumerate(lines):
            if 'FINAL GEOMETRY OBTAINED' not in line:
                continue
            # The cell identifier sits three lines below the marker, after a colon.
            cell = int(lines[i + 3].strip().split(':')[1])
            if cell not in wanted:
                continue
            if natoms is None:
                raise ValueError(missing_formula)
            sliced = [atom_line.split() for atom_line in lines[i + 4:i + 4 + natoms]]
            df = pd.DataFrame(sliced)
            # Creating a tuple (conf_name_Cell, coords)
            to_return.append((f"{file_name}_Cell_{cell}", df[[0, 1, 3, 5]]))
        return to_return

    # file_ext == 'out'
    with open(input_file, 'r') as f:
        natoms = _read_natoms(f)
    if natoms is None:
        # Without this check the search for the atom count would never end.
        raise ValueError(missing_formula)

    to_return = []
    cell = None  # no 'CELL' line seen yet
    with open(input_file, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break
            if 79 * '-' not in line:
                continue

            # Inside a section opened by a dashed line; it ends at a line of
            # asterisks or at the end of the file.
            while True:
                line = f.readline()
                if (79 * '*' in line) or not line:
                    break

                if 'CELL' in line:
                    cell = int(line.split(':')[1])
                elif 'CARTESIAN COORDINATES' in line and cell in wanted:
                    # Presumably skips the line between the title and the
                    # first atom; the helper is defined in aleimi.utils.
                    utils.ignoreLines(f, 1)
                    cart = [f.readline().split() for _ in range(natoms)]
                    # Build the table once per geometry, after all the atom
                    # lines were read.
                    df = pd.DataFrame(cart)
                    to_return.append((f"{file_name}_Cell_{cell}", df[[1, 2, 3, 4]]))
    return to_return


def main(
    input_file,
    boltzmann_file,
    energy_cut: float = 2,
    conformer_cut: float = 0,
    engine: str = 'psi4',
    machine: str = 'smaug',
    mkdir: bool = True,
    jobsh: bool = True,
    **keywords):
    """Write an input file, and optionally a job script, for every selected conformer.

    Parameters
    ----------
    input_file : str
        File handed to :func:`get_coords` to read the geometries from.
    boltzmann_file : str
        File handed to :func:`extract` to choose the conformers.
    energy_cut : float, optional
        Forwarded to :func:`extract`, by default 2
    conformer_cut : float, optional
        Forwarded to :func:`extract`, by default 0
    engine : str, optional
        Forwarded to ``templates.INPUT`` and used to pick the extension of the
        input file: ``psi4`` -> ``.in``, ``orca`` -> ``.inp``,
        ``gaussian`` -> ``.gjf``. Any other value prints a warning and
        falls back to ``.in``. By default 'psi4'
    machine : str, optional
        Forwarded to ``templates.INPUT``, by default 'smaug'
    mkdir : bool, optional
        If True, ``utils.makedirs`` is called with the conformer name and the
        files are written as ``<name>/<name><ext>`` and ``<name>/job.sh``.
        Otherwise they are written as ``<name><ext>`` and ``<name>.sh``.
        By default True
    jobsh : bool, optional
        Whether the job script is written as well, by default True
    **keywords
        Extra keyword arguments forwarded to ``templates.INPUT``.
    """
    extension_by_engine = (
        ('psi4', '.in'),
        ('orca', '.inp'),
        ('gaussian', '.gjf'),
    )
    for known_engine, extension in extension_by_engine:
        if engine == known_engine:
            InputExt = extension
            break
    else:
        # Unknown engine: warn and use the generic extension.
        print("Warning!: It was used 'in' as generic extension for the input "
              f"file for the non recognized engine: {engine}")
        InputExt = '.in'

    selected_cells = extract(boltzmann_file, energy_cut=energy_cut,
                             conformer_cut=conformer_cut)

    for name, coords in get_coords(input_file, selected_cells):
        job = templates.INPUT(engine, machine=machine, name=name, coords=coords, **keywords)

        # Decide where the files go before writing anything.
        if mkdir:
            utils.makedirs(name)
            input_path = os.path.join(name, f"{name}{InputExt}")
            script_path = os.path.join(name, "job.sh")
        else:
            input_path = f"{name}{InputExt}"
            script_path = f"{name}.sh"

        job.write(input_path, 'input')
        if jobsh:
            job.write(script_path, 'jobsh')


# Script entry point: currently a no-op, so running this module directly does
# nothing; the functionality is exposed through `main` above.
if __name__ == '__main__':
    pass