-
Notifications
You must be signed in to change notification settings - Fork 15
/
pacumen_train.py
executable file
·135 lines (104 loc) · 3.73 KB
/
pacumen_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
import pacumen
import getopt
import sys
import scipy.sparse as sparse
import oneclasstree
import cPickle as pickle
import numpy
import multinormal
def print_help():
    """Print the command-line usage summary for this script to stdout.

    The text covers every option accepted by the script's getopt
    specification ('T:N:hD:B:M:Gk:'), including -G, -k and -h.
    The program name shown in the usage line is taken from sys.argv[0].
    """
    # Single-argument, parenthesised print works on Python 2 and Python 3.
    print('''
train a classifier for a protocol over an encrypted tunnel given pcaps
you can specify multiple pcaps that belong to the target protocol and
multiple pcaps that do not belong to the target protocol
at least one of each must be supplied
an output filename must also be supplied
''')
    print('usage: %s [-T target] [-N nontarget] [-B bias] [-D depth] [-M minbias] [-G] [-k k] [-h] <outputfilename>' % (sys.argv[0]))
    print('''
-T target: specify a pcap with the target protocol, may be used multiple times
-N nontarget: specify a pcap without the target protocol, may be used multiple times
-B float: bias towards >= indicates positive class (default: auto), -bias may be used for the opposite
(optional) -D max depth: maximum depth of decision tree (default: 9)
(optional) -M minimum bias: minimum bias to use with -B auto (default: 0.0)
(optional) -G: use the multivariate gaussian likelihood function instead of the decision tree
(optional) -k int: k parameter passed to the -G classifier (default: 5)
(optional) -h: show this help and exit
''')
# ---------------------------------------------------------------------------
# Script body: parse the command line, build feature matrices from the pcaps,
# train a classifier (gaussian likelihood, fixed-bias tree, or a tree whose
# bias is found by bisection) and pickle it to the output file.
# ---------------------------------------------------------------------------


def _positive_probability(classifier, vector):
    """Return the normalised score at index 1 of classifier.classify(vector)[0].

    The script treats this value as P(positive | vector).  The arithmetic is
    done in place, exactly as the script did it before this helper existed.
    NOTE(review): for float arrays the halving cancels out under the
    normalisation that follows; confirm whether it can be dropped.
    """
    scores = classifier.classify(vector)[0]
    scores /= 2
    scores /= numpy.sum(scores)
    return scores[1]


try:
    options, remainder = getopt.getopt(sys.argv[1:], 'T:N:hD:B:M:Gk:')
except getopt.GetoptError as err:
    # Unknown option or missing option argument: report it instead of
    # dying with a traceback.
    sys.stderr.write('error: %s\n' % err)
    print_help()
    sys.exit(2)

# Defaults for everything the options can override.
targets = set()
nontargets = set()
maxdepth = 9
bias = 'auto'
gaussian = False
maxbias = 8.0
minbias = 0.0
k = 5

for opt, arg in options:
    try:
        if opt == '-T':
            targets.add(arg)
        elif opt == '-N':
            nontargets.add(arg)
        elif opt == '-h':
            print_help()
            sys.exit()
        elif opt == '-D':
            maxdepth = int(arg)
        elif opt == '-B':
            bias = float(arg) if arg.lower().strip() != 'auto' else 'auto'
        elif opt == '-M':
            minbias = float(arg)
        elif opt == '-G':
            gaussian = True
        elif opt == '-k':
            k = int(arg)
    except ValueError:
        # int()/float() rejected the option's argument.
        sys.stderr.write('error: invalid value %r for option %s\n' % (arg, opt))
        print_help()
        sys.exit(2)

# Exactly one output filename and at least one pcap of each kind are required.
if len(remainder) != 1 or not targets or not nontargets:
    print_help()
    sys.exit(2)

outfilename = remainder[0]

print('reading pcaps')
ntmat = sparse.vstack([pacumen.make_feature_vectors_from_pcap(pcap) for pcap in nontargets]).tocsr()
tmat = sparse.vstack([pacumen.make_feature_vectors_from_pcap(pcap) for pcap in targets]).tocsr()

classifier = None

print('have %d rows of target data and %d rows of non-target data' % (tmat.shape[0], ntmat.shape[0]))

# The oracle is built from non-target and target rows stacked together.
all_data = sparse.vstack([ntmat, tmat]).tocsr()
oracle = oneclasstree.bincount_oracle(all_data)
best = None

if gaussian:
    print('using mutlivariate gaussian likelihood function')
    classifier = multinormal.MultiNormalLikelihood(tmat, all_data, k=k)
elif bias != 'auto':
    print('training')
    classifier = oneclasstree.OneClassTree(tmat, oracle, maxdepth=maxdepth, target=0.9, bias=bias)
else:  # need to try many biases
    # An all-zero feature row; the search looks for a bias at which this row
    # is scored below 0.5 for the positive class.
    zerovector = sparse.csr_matrix((1, tmat.shape[1]), dtype=tmat.dtype)
    maxiters = 22

    # try it with the minimum bias and see if that works
    classifier = oneclasstree.OneClassTree(tmat, oracle, maxdepth=maxdepth, target=0.9, bias=minbias)
    zvp = _positive_probability(classifier, zerovector)

    if zvp >= 0.5:
        # uh oh, gotta find a bias: bisect the interval [minbias, maxbias]
        for _ in range(maxiters):
            current = numpy.mean([maxbias, minbias])
            classifier = oneclasstree.OneClassTree(tmat, oracle, maxdepth=maxdepth, target=0.9, bias=current)
            # calculate P(POSITIVE|ZEROVECTOR)
            zvp = _positive_probability(classifier, zerovector)
            print('bias %f, p(positive|zerovector) = %f' % (current, zvp))
            if zvp < 0.5:  # we may be able to use a smaller bias
                maxbias = current
                if best is None or current < best[0]:
                    best = (current, classifier)
            else:  # we need a larger bias
                minbias = current

        if best is not None:
            print('using bias %f' % (best[0]))
            classifier = best[1]
        else:
            # No candidate bias pushed the zero vector below 0.5; the last
            # classifier trained is still written, but say so.
            sys.stderr.write(
                'warning: no bias up to %f gave p(positive|zerovector) < 0.5; '
                'using the last classifier trained\n' % maxbias)

print('writing classifier to file')
with open(outfilename, 'wb') as f:
    pickle.dump(classifier, f, protocol=2)

if hasattr(classifier, 'selected_columns'):
    print('Used packet sizes: %s' % (classifier.selected_columns(),))