forked from nyu-dl/conditional-molecular-design-ssvae
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
67 lines (44 loc) · 1.81 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, QED
from tensorflow.contrib.keras import preprocessing
class ZINC(object):
    """SMILES helpers: RDKit property lookup, tokenisation and one-hot encoding.

    All methods are static; the class only serves as a namespace around
    ``char_set``.
    """

    # Token vocabulary. A token's position in this list is its integer code,
    # so index 0 (' ') is also the value used for padding in ``vectorize``.
    char_set = [
        ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        '-', '#', '(', ')', '[', ']', '+', '=',
        'B', 'Br', 'c', 'C', 'Cl', 'F', 'H', 'I', 'N', 'n',
        'O', 'o', 'P', 'p', 'S', 's', 'Si', 'Sn',
    ]

    @staticmethod
    def get_property(smi):
        """Return ``[ExactMolWt, MolLogP, QED]`` for a SMILES string.

        Args:
            smi: SMILES string handed to ``Chem.MolFromSmiles``.

        Returns:
            A three-element list of descriptor values, or the string
            ``'invalid'`` if parsing or any descriptor computation raises.
        """
        try:
            mol = Chem.MolFromSmiles(smi)
            return [
                Descriptors.ExactMolWt(mol),
                Descriptors.MolLogP(mol),
                QED.qed(mol),
            ]
        except Exception:
            # Deliberately best-effort: callers check for the 'invalid'
            # sentinel. ``Exception`` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            return 'invalid'

    @staticmethod
    def canonocalize(smi):
        """Round-trip ``smi`` through RDKit and return the resulting SMILES.

        No error handling is done here: whatever ``Chem.MolFromSmiles`` or
        ``Chem.MolToSmiles`` raises for an unparsable input propagates.
        """
        return Chem.MolToSmiles(Chem.MolFromSmiles(smi))

    @staticmethod
    def vectorize(list_input):
        """One-hot encode integer token sequences into shifted (input, target) arrays.

        Args:
            list_input: 2-D integer array-like of shape ``(n, length)`` whose
                entries are indices into ``char_set``.

        Returns:
            A tuple ``(x, y)`` of ``int32`` arrays, each of shape
            ``(n, length + 3, len(char_set))``. Both are views of one buffer
            of ``length + 4`` time steps: ``x`` drops the last step and ``y``
            drops the first, so ``y`` is ``x`` shifted by one step.
        """
        seqs = np.asarray(list_input)
        n_seq, length = seqs.shape
        one_hot = np.zeros((n_seq, length + 4, len(ZINC.char_set)), dtype=np.int32)

        # Step 0 is left all-zero; tokens occupy steps 1..length.
        rows = np.arange(n_seq)[:, None]
        steps = np.arange(1, length + 1)[None, :]
        one_hot[rows, steps, seqs] = 1

        # The three trailing steps are marked with token 0 (' ').
        one_hot[:, -3:, 0] = 1

        return one_hot[:, 0:-1, :], one_hot[:, 1:, :]

    @staticmethod
    def smiles_to_seq(smiles):
        """Tokenise SMILES strings into a post-padded integer matrix.

        Each string is scanned left to right; at every position a
        two-character token from ``char_set`` (e.g. ``'Cl'``, ``'Br'``) is
        preferred over a one-character token.

        Args:
            smiles: Iterable of SMILES strings.

        Returns:
            The result of ``pad_sequences(..., padding='post')`` applied to
            the per-string lists of ``char_set`` indices.

        Raises:
            ValueError: If a string contains a symbol that is not in
                ``char_set``. (Previously this case never advanced the scan
                position and looped forever.)
        """
        char_to_int = {c: i for i, c in enumerate(ZINC.char_set)}
        list_seq = []
        for s in smiles:
            seq = []
            j = 0
            while j < len(s):
                pair = s[j:j + 2]
                # NOTE(review): greedy two-character matching means an 'S'
                # directly followed by aromatic 'n' is read as the 'Sn'
                # token - confirm this is acceptable for the data in use.
                if len(pair) == 2 and pair in char_to_int:
                    seq.append(char_to_int[pair])
                    j += 2
                elif s[j] in char_to_int:
                    seq.append(char_to_int[s[j]])
                    j += 1
                else:
                    raise ValueError(
                        'unsupported symbol %r at position %d in SMILES %r'
                        % (s[j], j, s)
                    )
            list_seq.append(seq)
        return preprocessing.sequence.pad_sequences(list_seq, padding='post')