-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
141 lines (104 loc) · 3.38 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
###### import ######
import numpy as np
try:
from rdkit import Chem
from rdkit.Chem import AllChem
except:
pass
###### import ######
def plot_hist(prefix_name, prediction, label):
    """Save overlaid histograms of predictions for positive and negative samples.

    Args:
        prefix_name: Prefix of the output path; the image is written to
            ``prefix_name + "_histogram.png"``.
        prediction: Sequence of predicted values, aligned with ``label``.
        label: Sequence of labels. Entries equal to 1 are drawn as
            "positive" (blue), entries equal to 0 as "negative" (red);
            any other label value is left out of both histograms.
    """
    # The plotting libraries are imported locally, so they are only needed
    # when this function is actually called.
    import seaborn as sns
    import matplotlib.pyplot as plt

    figure_name = prefix_name + "_histogram.png"
    positive_prediction = [p for p, y in zip(prediction, label) if y == 1]
    negative_prediction = [p for p, y in zip(prediction, label) if y == 0]

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # it is kept here so the rendered figure stays unchanged.
    sns.distplot(positive_prediction, hist=True, kde=False, bins=50,
                 color='blue', label='positive')
    sns.distplot(negative_prediction, hist=True, kde=False, bins=50,
                 color='red', label='negative')
    plt.legend()
    plt.savefig(figure_name)
    # Close the figure once it is saved; otherwise a later call would draw
    # its histograms on top of the ones plotted here.
    plt.close()
def replace_strange_symbol(text):
    """Return ``text`` with every ``[``, ``]``, ``'``, newline and ``/`` replaced by ``_``."""
    # One translation pass instead of a chain of str.replace calls.
    underscore_table = str.maketrans(dict.fromkeys("[]'\n/", '_'))
    return text.translate(underscore_table)
# xml read blog: https://blog.csdn.net/yiluochenwu/article/details/23515923
def walkData(root_node, prefix, result_list):
    """Flatten an XML element tree into ``[path, text]`` pairs, parents first.

    For ``root_node`` and each of its descendants, a two-item list
    ``[prefix + '/' + <tags from root_node down to the element>, element.text]``
    is appended to ``result_list``.

    Args:
        root_node: Element to start from; it must expose ``tag`` and ``text``
            and yield its direct children when iterated.
        prefix: Path string prepended to ``root_node``'s own tag.
        result_list: List that is extended in place with the pairs.

    Returns:
        None. The output is accumulated in ``result_list``.
    """
    node_path = prefix + '/' + root_node.tag
    result_list.append([node_path, root_node.text])
    # Iterate the element itself to reach its children:
    # xml.etree's Element.getchildren() was removed in Python 3.9.
    for child in root_node:
        walkData(child, prefix=node_path, result_list=result_list)
def dynamic_programming(s1, s2):
    """Return the length of the longest common subsequence of ``s1`` and ``s2``.

    Args:
        s1: First sequence (for example a string); its items are compared
            with ``==`` against the items of ``s2``.
        s2: Second sequence.

    Returns:
        int: Length of the longest (not necessarily contiguous) subsequence
        shared by both inputs; 0 when either input is empty.
    """
    # previous_row[j] holds the LCS length of the s1 prefix processed so far
    # against the first j items of s2. The extra leading 0 column stands for
    # the empty prefix, so empty inputs need no special casing.
    previous_row = [0] * (len(s2) + 1)
    for item1 in s1:
        current_row = [0]
        for j, item2 in enumerate(s2, start=1):
            if item1 == item2:
                # A match extends the best result of both shorter prefixes.
                current_row.append(previous_row[j - 1] + 1)
            else:
                # Otherwise carry over the better of dropping one item from
                # either sequence.
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row
    return previous_row[-1]
def get_path_of_all_xml_file(input_file="../../data/all_xml"):
    """Read a listing file and return its lines with surrounding whitespace removed.

    Args:
        input_file: Path of the text file to read, one entry per line.
            Defaults to ``"../../data/all_xml"``, the location that was
            previously hard-coded.

    Returns:
        list[str]: One stripped string per line of the file, in file order.
        Blank lines are kept as empty strings.
    """
    with open(input_file, 'r') as fin:
        return [line.strip() for line in fin]
def remove_multiple_space(text):
    """Collapse each run of whitespace in ``text`` into one space and trim both ends."""
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so re-joining with one space normalizes the spacing.
    words = text.split()
    return ' '.join(words)
def nctid_2_xml_file_path(nctid):
    """Build the path of the XML file for an 11-character NCT identifier.

    The file is placed in a sub-folder named after the first seven
    characters of the id followed by ``"xxxx"``, for example
    ``NCT00000102`` -> ``./ClinicalTrialGov/NCT0000xxxx/NCT00000102.xml``.

    Args:
        nctid: Identifier string; it must be exactly 11 characters long.

    Returns:
        str: Path of the form ``./ClinicalTrialGov/<prefix>/<nctid>.xml``.

    Raises:
        AssertionError: If ``nctid`` is not 11 characters long.
    """
    # Imported here because the module's import section does not import os.
    import os

    assert len(nctid) == 11, "nctid must be exactly 11 characters long"
    prefix = nctid[:7] + "xxxx"
    xml_file_path = os.path.join("./ClinicalTrialGov/", prefix, nctid + ".xml")
    return xml_file_path
def fingerprints_from_mol(mol):
    """Fold a count-based Morgan fingerprint of ``mol`` into a ``(1, 2048)`` int32 array.

    The fingerprint comes from
    ``AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)``.
    Each non-zero element is added to column ``index % 2048``, so elements
    whose indices collide modulo 2048 are summed.

    Args:
        mol: Molecule object accepted by ``AllChem.GetMorganFingerprint``.

    Returns:
        numpy.ndarray: Array of shape ``(1, 2048)`` and dtype ``int32``.
    """
    n_columns = 2048
    folded = np.zeros((1, n_columns), np.int32)
    morgan_counts = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    for element_id, count in morgan_counts.GetNonzeroElements().items():
        folded[0, element_id % n_columns] += int(count)
    return folded
def smiles2fp(smiles):
    """Convert a SMILES string into a ``(1, 2048)`` int32 fingerprint array.

    This is best-effort: if the molecule cannot be built or fingerprinted
    (including when RDKit failed to import), an all-zero array of the same
    shape is returned instead of raising.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        numpy.ndarray: Array of shape ``(1, 2048)`` and dtype ``int32``.
    """
    fallback = np.zeros((1, 2048), np.int32)
    try:
        # Earlier code passed an undefined name (``smile``) here; the
        # resulting NameError was swallowed, so every call returned zeros.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit returns None for SMILES it cannot parse.
            return fallback
        return fingerprints_from_mol(mol)
    except Exception:
        # Keep the all-zero fallback for any failure, but no longer catch
        # KeyboardInterrupt / SystemExit as the bare ``except`` did.
        return fallback
def smiles_lst2fp(smiles_lst):
    """Return the element-wise mean of the fingerprints of several SMILES strings.

    Each entry of ``smiles_lst`` is converted with ``smiles2fp``; the
    resulting arrays are concatenated along axis 0 and averaged over that
    axis.

    Args:
        smiles_lst: Iterable of SMILES strings.

    Returns:
        numpy.ndarray: Mean over axis 0 of the concatenated fingerprints.
    """
    per_molecule = list(map(smiles2fp, smiles_lst))
    stacked = np.concatenate(per_molecule, axis=0)
    return np.mean(stacked, axis=0)
if __name__ == "__main__":
    # Small demo: sanitize a sample figure path and print the result.
    sample_figure_path = "interpret_result/NCT00329602__completed____1__1.7650960683822632__phase 4__['restless legs syndrome']__['placebo', 'ropinirole'].png"
    sanitized_path = replace_strange_symbol(sample_figure_path)
    print(sanitized_path)
# if __name__ == "__main__":
# input_file_lst = get_path_of_all_xml_file()
# print(input_file_lst[:5])
# '''
# input_file_lst = [
# 'ClinicalTrialGov/NCT0000xxxx/NCT00000102.xml',
# 'ClinicalTrialGov/NCT0000xxxx/NCT00000104.xml',
# 'ClinicalTrialGov/NCT0000xxxx/NCT00000105.xml',
# ... ]
# '''
# if __name__ == "__main__":
# s1 = "328943"
# s2 = "13785"
# assert dynamic_programming(s1, s2)==2