-
Notifications
You must be signed in to change notification settings - Fork 3
/
generate.py
executable file
·75 lines (60 loc) · 3.06 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
"""
datasink: A Pipeline for Large-Scale Heterogeneous Ensemble Learning
Copyright (C) 2013 Sean Whalen
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see [http://www.gnu.org/licenses/].
"""
from itertools import product
from os import environ, system
from os.path import abspath, dirname, exists
from sys import argv
from common import load_arff, load_properties
from sklearn.externals.joblib import Parallel, delayed
def classify(parameters):
working_dir, project_path, classifier, fold, bag = parameters
expected_filenames = ['%s/%s/predictions-%s-%02i.csv.gz' % (project_path, classifier.split()[0], fold, bag)] + ['%s/%s/validation-%s-%02i-%02i.csv.gz' % (project_path, classifier.split()[0], fold, nested_fold, bag) for nested_fold in nested_fold_values]
if sum(map(exists, expected_filenames)) == len(expected_filenames):
return
cmd = 'groovy -cp %s %s/Pipeline.groovy %s %s %s %s' % (classpath, working_dir, project_path, fold, bag, classifier)
if use_cluster:
cmd = '%s \"%s\"' % (cluster_cmd, cmd)
system(cmd)
# ensure project directory exists
project_path = abspath(argv[1])
assert exists(project_path)
# load and parse project properties
p = load_properties(project_path)
classifiers_fn = '%s/%s' % (project_path, p['classifiersFilename'])
input_fn = '%s/%s' % (project_path, p['inputFilename'])
assert exists(input_fn)
# generate cross validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
df = load_arff(input_fn)
fold_values = df[p['foldAttribute']].unique()
else:
fold_values = range(int(p['foldCount']))
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
bag_values = range(bag_count) if bag_count > 1 else [0]
# ensure java's classpath is set
classpath = environ['CLASSPATH']
# command for cluster execution if enabled
use_cluster = False if 'useCluster' not in p else p['useCluster'] == 'true'
cluster_cmd = 'rc.py --cores 1 --walltime 06:00:00 --queue small --allocation acc_9'
# load classifiers from file, skip commented lines
classifiers = filter(lambda x: not x.startswith('#'), open(classifiers_fn).readlines())
classifiers = [_.strip() for _ in classifiers]
working_dir = dirname(abspath(argv[0]))
n_jobs = 1 if use_cluster else -1
all_parameters = list(product([working_dir], [project_path], classifiers, fold_values, bag_values))
Parallel(n_jobs = n_jobs, verbose = 50)(delayed(classify)(parameters) for parameters in all_parameters)