In [12]:
import random
from rdkit import Chem
import pandas as pd

# Fragment vocabulary used by the generator (labelled "biologically safe"
# by the notebook author). All entries are SMILES snippets.

# Single atoms a generated string can start from.
safe_atoms = [
    'C',
    'N',
    'O',
]

# Short acyclic substituents.
safe_branches = [
    'C(C)C',
    'C(=O)N',
    'N(C)C',
    'C(=O)OC',
    'O=C',
    'CCO',
]

# Six-membered rings written in aromatic (lowercase) notation.
safe_aromatics = [
    'c1ccccc1',
    'c1ccncc1',
]

# Rings written with explicit single/double bonds.
safe_rings = [
    'C1CC1',
    'C1CCC1',
    'C1=CC=CC=C1',
]

def generate_safe_smiles():
    """Assemble a random SMILES-like string from the fragment lists.

    One entry of ``safe_atoms`` is drawn as the starting atom. Between two
    and five fragments are then drawn from the combined ``safe_branches``,
    ``safe_aromatics`` and ``safe_rings`` lists, each one prefixed with a
    randomly chosen connector ('', '=' or '-').

    Returns
    -------
    str
        The concatenated string. It is not checked for chemical validity
        here; callers are expected to validate it.
    """
    fragment_pool = safe_branches + safe_aromatics + safe_rings
    pieces = [random.choice(safe_atoms)]
    n_fragments = random.randint(2, 5)
    for _ in range(n_fragments):
        # Draw the fragment first, then its connector, so the sequence of
        # random draws stays: atom, count, (fragment, connector)*.
        fragment = random.choice(fragment_pool)
        connector = random.choice(['', '=', '-'])
        pieces.append(connector + fragment)
    return ''.join(pieces)

# Generate and validate
# Collect 50 distinct canonical SMILES. `safe_smiles` keeps insertion order
# for the DataFrame below; the companion set makes the duplicate check a
# constant-time lookup instead of rescanning the whole list per candidate.
safe_smiles = []
seen_smiles = set()
while len(safe_smiles) < 50:
    smi = generate_safe_smiles()
    # MolFromSmiles yields None (and logs an RDKit error) for strings it
    # cannot parse or sanitize; those candidates are simply dropped.
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue
    # Canonicalize so that different spellings of one molecule deduplicate.
    cano = Chem.MolToSmiles(mol)
    if cano not in seen_smiles:
        seen_smiles.add(cano)
        safe_smiles.append(cano)

# Save
df = pd.DataFrame(safe_smiles, columns=["SMILES"])
df.to_csv("biologically_safe_molecules.csv", index=False)
df.head()


[13:10:24] Explicit valence for atom # 1 O, 4, is greater than permitted
[13:10:24] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17
[13:10:24] Explicit valence for atom # 3 O, 3, is greater than permitted
[13:10:24] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9
[13:10:24] Explicit valence for atom # 1 O, 3, is greater than permitted
[13:10:24] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10
[13:10:24] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[13:10:24] Explicit valence for atom # 4 C, 5, is greater than permitted
[13:10:24] Explicit valence for atom # 5 O, 3, is greater than permitted
[13:10:24] Explicit valence for atom # 10 O, 3, is greater than permitted
[13:10:24] Explicit valence for atom # 1 C, 5, is greater than permitted
[13:10:24] Explicit valence for atom # 4 O, 3, is greater than permitted
[13:10:24] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20
[13:10:24] Explicit valence for atom # 3 O, 3, is greater than permitted
[13:10:24] Explicit v

Unnamed: 0,SMILES
0,Cc1ccccc1N(C)C
1,NC1CC1C1CCC1C1CC1
2,C=CCOC1CCC1c1ccccc1
3,NCCOC(=O)NCCOC1CCC1
4,CC(C)CC(=O)NC(=O)OCc1ccccc1CCO


In [14]:
from google.colab import files

# Download the CSV to your computer
# (uses the google.colab `files` helper imported above, so this cell is
# expected to run only inside a Google Colab session; the file name must
# match the one written by the earlier `df.to_csv(...)` call)
files.download("biologically_safe_molecules.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
!pip install pubchempy
import pubchempy as pcp
from rdkit import Chem

def is_known_pubchem(smi):
    """Return True when PubChem holds a compound record matching ``smi``.

    Parameters
    ----------
    smi : str
        SMILES string passed to ``pcp.get_compounds`` with
        ``namespace='smiles'``.

    Returns
    -------
    bool
        True if at least one returned compound carries a CID. False if none
        does, or if the lookup raised an error; in the error case a warning
        is emitted so failed lookups are visible instead of being silently
        counted as "not known".
    """
    import warnings  # function-scope import keeps this cell self-contained

    try:
        compounds = pcp.get_compounds(smi, namespace='smiles')
    except Exception as exc:
        # Best-effort lookup: an API/network failure must not abort a long
        # run, but KeyboardInterrupt/SystemExit are no longer swallowed.
        warnings.warn(
            f"PubChem lookup failed for {smi!r}: {exc!r}; treating as not known",
            stacklevel=2,
        )
        return False
    # NOTE(review): a SMILES query appears able to return a record without
    # a CID for structures PubChem does not hold - confirm against the
    # PubChemPy docs. Counting only records with a CID is correct either way.
    return any(getattr(compound, "cid", None) for compound in compounds)


Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13818 sha256=bca7d2f4bdfdbad6567501b2732fbca8bff1735a5b96fcc7baeb4df31ae0feb4
  Stored in directory: /root/.cache/pip/wheels/8b/e3/6c/3385b2db08b0985a87f5b117f98d0cb61a3ae3ca3bcbbd8307
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [16]:
# Ask PubChem about every generated SMILES and record the answer per row.
df["Known"] = df["SMILES"].apply(is_known_pubchem)
# Keep the rows that were not flagged as known, renumber them from zero,
# and write them out.
df_novel = df.loc[~df["Known"]].reset_index(drop=True)
df_novel.to_csv("novel_only_molecules.csv", index=False)


In [18]:
from google.colab import files

# Download the CSV to your computer
# (uses the google.colab `files` helper imported above, so this cell is
# expected to run only inside a Google Colab session; the file name must
# match the one written by the preceding `df_novel.to_csv(...)` call)
files.download("novel_only_molecules.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:

# Grow the existing `safe_smiles` list to 30000 distinct canonical SMILES.
# The set is rebuilt from the list here so this cell depends only on
# `safe_smiles` itself; membership tests against a set stay constant-time,
# whereas `in` on a 30000-element list rescans it for every candidate.
seen_smiles = set(safe_smiles)
while len(safe_smiles) < 30000:
    smi = generate_safe_smiles()
    # MolFromSmiles yields None for strings RDKit cannot parse or sanitize.
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue
    # Canonicalize so that different spellings of one molecule deduplicate.
    cano = Chem.MolToSmiles(mol)
    if cano not in seen_smiles:
        seen_smiles.add(cano)
        safe_smiles.append(cano)


In [37]:


# NOTE(review): this redefines `is_known_pubchem`, which an earlier cell of
# this notebook already defines; keep a single definition when tidying up.
def is_known_pubchem(smi):
    """Return True when PubChem holds a compound record matching ``smi``.

    Parameters
    ----------
    smi : str
        SMILES string passed to ``pcp.get_compounds`` with
        ``namespace='smiles'``.

    Returns
    -------
    bool
        True if at least one returned compound carries a CID. False if none
        does, or if the lookup raised an error; in the error case a warning
        is emitted so failed lookups are visible instead of being silently
        counted as "not known".
    """
    import warnings  # function-scope import keeps this cell self-contained

    try:
        compounds = pcp.get_compounds(smi, namespace='smiles')
    except Exception as exc:
        # Best-effort lookup: an API/network failure must not abort a long
        # run, but KeyboardInterrupt/SystemExit are no longer swallowed.
        warnings.warn(
            f"PubChem lookup failed for {smi!r}: {exc!r}; treating as not known",
            stacklevel=2,
        )
        return False
    # NOTE(review): a SMILES query appears able to return a record without
    # a CID for structures PubChem does not hold - confirm against the
    # PubChemPy docs. Counting only records with a CID is correct either way.
    return any(getattr(compound, "cid", None) for compound in compounds)


In [33]:
# Rebuild the frame from the current contents of `safe_smiles`. The `df`
# created when the first CSV was saved only holds the molecules that existed
# at that point, so reusing it would ignore everything generated since.
df = pd.DataFrame(safe_smiles, columns=["SMILES"])
# One PubChem lookup per row; with a large `safe_smiles` this cell is slow.
df["Known"] = df["SMILES"].apply(is_known_pubchem)
# Keep the rows not flagged as known, renumbered from zero, and save them.
df_novel = df[~df["Known"]].reset_index(drop=True)
df_novel.to_csv("novel_only_molecules1lakh.csv", index=False)


In [34]:
from google.colab import files

# Download the CSV to your computer
# (uses the google.colab `files` helper imported above, so this cell is
# expected to run only inside a Google Colab session; the file name must
# match the one written by the preceding `df_novel.to_csv(...)` call)
files.download("novel_only_molecules1lakh.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>