I am trying to define a function that I can use on multiple dataframes with SMILE string as values. The function should calculate the molecular descriptor values, so I can add them to the dataframe and use it in a machine learning model.
cached(cache={})
def get_molecular_descriptor_calculator():
"""Cache call to retreive molecular descriptors function"""
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
descriptor_vals = ['ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3',
'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt', 'Ipc', 'Kappa1',
'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex',
'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge',
'MolLogP', 'MolMR', 'MolWt']
return MolecularDescriptorCalculator(descriptor_vals), descriptor_vals
def vudf_compute_molecular_properties(smile_string_df: T.PandasDataFrame[str]) -> T.PandasSeries[dict]:
"""Calculate molecular descriptor from SMILE values in a df"""
#Load the libraries:
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
smile_string_df.columns = ['SMILES']
descriptor_calculator, descriptors = get_molecular_descriptor_calculator()
def smiles_to_descriptors(smile):
"""Helper function to apply to batches of rows"""
mol = Chem.MolFromSmiles(smile)
mol_property_vals = descriptor_calculator.CalcDescriptors(mol)
return dict(zip(descriptors, mol_property_vals))
#smile_string_df['descriptors'] = smile_string_df.SMILES.apply(smi_to_descriptors)
return smile_string_df.SMILES.apply(smiles_to_descriptors)
I got an error that says : NameError Traceback (most recent call last) Cell In[33], line 15 6 descriptor_vals = ['ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 7 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt', 'Ipc', 'Kappa1', 8 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex', 9 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 10 'MolLogP', 'MolMR', 'MolWt'] 12 return MolecularDescriptorCalculator(descriptor_vals), descriptor_vals ---> 15 def vudf_compute_molecular_properties(smile_string_df: T.PandasDataFrame[str]) -> T.PandasSeries[dict]: 16 """Calculate molecular descriptor from SMILE values in a df""" 18 #Load the libraries:
NameError: name 'T' is not defined