-
Notifications
You must be signed in to change notification settings - Fork 3
/
runexp.py
1241 lines (1154 loc) · 63.1 KB
/
runexp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# coding: utf-8
"""A simple framework to run experiments with a complex workflow
- Features
- Users define *tasks*, which specify source/target files and a
command to generate target files from source files
- The system automatically computes dependencies among tasks and
execute them in order
- Tasks are executed only when:
- some targets do not exist
- some targets are older than source files (in a similar way as
`make`), or
- some sources are rebuilt in preceding tasks
- The system runs tasks in parallel
- How to use
1. Create an instance of `runexp.Workflow`
2. Call this instance with arguments `source`, `target`, and `rule`.
3. Call the method `run()`
- See the example below. A more elaborate example can be found at the
end of `runexp.py`. You can see what happens by running:
`python runexp.py`
- Example
- The following example executes `ls -l`, `sort`, and then `head`.
- The order of `exp()` does not matter. The system automatically
computes dependencies among tasks, and executes them in an
appropriate order.
```
import runexp
exp = runexp.Workflow()
exp(target='input.txt', rule='ls -l > input.txt')
exp(source='input.txt', target='sorted.txt', rule='sort input.txt > sorted.txt')
exp(source='sorted.txt', target='head.txt', rule='head -n 1 sorted.txt > head.txt')
exp.run()
```
- More features
- Output dependencies of tasks into a PNG file to visualize
- Users can specify fine-grained conditions to execute a task
- `always`: always execute a task (i.e. execute a task even when
targets are newer than sources)
- `no_timestamp`: do not check timestamp (i.e. do not execute a
task even when targets are old)
- these conditions can also be overridden by command-line
arguments
- Users can specify any dependent files in addition to sources
(e.g. a script to generate targets, a directory to output files)
- `depend` files are used only for checking timestamps; i.e. tasks
are executed when any `depend` files are newer than targets
(e.g. a script is modified)
- `require` files are used only for checking their existence;
i.e. their timestamps are not checked (e.g. to create a
directory to output files)
- Users can define environment variables for each worker (a process
to execute a task)
- can be used to change the behavior of each worker in parallel
processing
- e.g. allocate a dedicated GPU to each worker
- Users can specify resource conditions on workers and tasks
- Tasks are assigned to workers that have sufficient resources to
run the task
- e.g. assign GPUs to each worker, and run a task that requires
GPUs on the worker with a sufficient number of GPUs
- Defining tasks
- You can add a task to the workflow by calling the `Workflow`
instance with the following arguments.
- `source`: input files (space-separated string or list of strings)
- `target`: target files (space-separated string or list of strings)
- `rule`: command to execute (string or list of strings)
- `depend` (optional): other dependent files (space-separated
string or list of strings)
- `require` (optional): other required files (space-separated
string or list of strings)
- `name` (optional): a short name shown in the log message
- `desc` (optional): a detailed description shown in the task list
- The following options may be specified to control the behavior:
- `always`: always execute this task (bool; default=False)
- `no_timestamp`: do not check timestamp (bool; default=False)
- `ignore_same_task`: ignore multiply added equivalent tasks
(bool; default=False); when equivalent tasks are found but this
option is False, the system shows an error.
- `ignore_error`: ignore an error of the executed command (bool;
default=False)
- `no_exec`: do not execute this task by default (bool;
default=False)
- `phony`: targets are not real files (bool; default=False); the
same as "phony targets" in `make`
- Command-line arguments
- Run `python runexp.py -h` to see the description of command-line
arguments.
- Environment variables
- Use the command-line argument `-E` to give a list of environment
variable settings for workers
- Each element of the list is a dictionary, which is set as
environment variables of each worker.
- The length of the list must be equal to the number of workers
- Alternatively you can use the argument `environments_distributed`
of the constructor or `set_options()` of Workflow
- e.g. the following example specifies the environment variable
`GPU` for two workers.
- `exp = Workflow(environments_distributed=[{'GPU': '1'}, {'GPU': '2'}])`
- Defining resource conditions
- Specify available resources for workers, and required resources
for tasks; tasks are executed on a worker with sufficient
resources
- Available resources for workers
- Use the command-line argument `-r` to give a list of available
resources for workers. Each element is a dict, which denotes
available resources for a worker.
- Alternatively you can use the argument `resources` of the
constructor or `set_options()` of Workflow
- e.g. the following example denotes that the first worker has one
GPU and 4GB memory, while the second worker has no GPU and 16GB
memory.
- `exp = Workflow(resources=[{'GPU': 1, 'Mem': 4}, {'Mem': 16}])`
- Required resources for tasks
- Specify required resources for a task by the argument `resource`
- e.g. the following example specifies that the task requires 8GB
memory.
- `exp(resource={'Mem': 8}, ...)`
"""
from __future__ import print_function, unicode_literals, absolute_import
import sys
import os
import subprocess
import argparse
import logging
import multiprocessing
from datetime import datetime
from collections import deque
from fnmatch import fnmatch
import signal
import json
import psutil
# StringIO moved to the io module in Python 3; fall back for Python 2.
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
# Python 2/3 compatibility shims: string type check and dict iteration.
if sys.version_info.major == 2:
    def isstr(s):
        """Return True if *s* is a string (byte or unicode on Python 2)."""
        return isinstance(s, basestring)
    def iterdict(d):
        """Iterate over the (key, value) pairs of dict *d*."""
        return d.iteritems()
else:
    def isstr(s):
        """Return True if *s* is a string."""
        return isinstance(s, str)
    def iterdict(d):
        """Iterate over the (key, value) pairs of dict *d*."""
        return d.items()
# Module-level logger: emits timestamped messages to stderr at INFO level.
logger = logging.getLogger(__name__)
#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s:%(name)s:%(funcName)s:%(levelname)s: %(message)s')
#logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(name)s:%(levelname)s: %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)
######################################################################
def coloring(color, text):
    """Return *text* wrapped in ANSI escape sequences for the named color.

    Known color names: default, black, red, green, yellow, blue, purple,
    lightblue, white.  An unknown name raises KeyError.
    """
    escapes = {
        'default': '\033[0m',
        'black': '\033[30m',
        'red': '\033[31m',
        'green': '\033[32m',
        'yellow': '\033[33m',
        'blue': '\033[34m',
        'purple': '\033[35m',
        'lightblue': '\033[36m',
        'white': '\033[37m',
    }
    return '{}{}{}'.format(escapes[color], text, escapes['default'])
######################################################################
class Task:
    """A single task to receive source files and produce target files.

    A task specifies source/target files and shell commands (`rule`) to
    build the targets from the sources.  `depend` files take part in
    timestamp checks only; `require` files are only checked for existence.
    String arguments are split on whitespace (`rule` is split on newlines,
    one command per line).
    """
    def __init__(self, name=None, desc=None, source=None, target=None, rule=None, depend=None, require=None, resource=None, always=False, no_timestamp=False, ignore_same_task=False, ignore_error=False, no_exec=False, phony=False):
        """Normalize and validate the task definition.

        Raises ValueError when any argument has an unexpected type.
        """
        # Use None defaults instead of mutable ones ([] / {}): a shared default
        # container stored on self could be mutated through one Task instance
        # and silently leak into every later Task created with the default.
        source = [] if source is None else source
        target = [] if target is None else target
        rule = [] if rule is None else rule
        depend = [] if depend is None else depend
        require = [] if require is None else require
        resource = {} if resource is None else resource
        # accept space-separated strings as shorthand for lists
        if isstr(source):
            source = source.split()
        if isstr(target):
            target = target.split()
        if isstr(rule):
            rule = rule.split('\n')  # one command per line
        if isstr(depend):
            depend = depend.split()
        if isstr(require):
            require = require.split()
        if name is None:
            name = ' '.join(target)  # default short name: the target list
        if not isstr(name):
            raise ValueError("name must be a string: {}".format(name))
        if desc is not None and not isstr(desc):
            raise ValueError("desc must be a string: {}".format(desc))
        if not (isinstance(source, list) and all([isstr(x) for x in source])):
            raise ValueError("source must be a string or a list of strings: {}".format(source))
        if not (isinstance(target, list) and all([isstr(x) for x in target])):
            raise ValueError("target must be a string or a list of strings: {}".format(target))
        if not (isinstance(rule, list) and all([isstr(x) for x in rule])):
            raise ValueError("rule must be a string or a list of strings: {}".format(rule))
        if not (isinstance(depend, list) and all([isstr(x) for x in depend])):
            raise ValueError("depend must be a string or a list of strings: {}".format(depend))
        if not (isinstance(require, list) and all([isstr(x) for x in require])):
            raise ValueError("require must be a string or a list of strings: {}".format(require))
        if not isinstance(resource, dict):
            raise ValueError("resource must be dict")
        self.name = name
        self.desc = desc
        self.source = source
        self.target = target
        self.rule = rule
        self.depend = depend
        self.require = require
        self.resource = resource
        self.always = always # force all commands to be executed
        self.no_timestamp = no_timestamp # run commands only when targets do not exist (do not check timestamp)
        self.ignore_same_task = ignore_same_task # do not complain about equivalent duplicate tasks
        self.ignore_error = ignore_error # treat a failing rule as success
        self.no_exec = no_exec # do not execute this task by default
        self.phony = phony # targets are not real files (like make's .PHONY)
    def __repr__(self):
        return self.__str__()
    def __str__(self):
        return u'Task {}: rule="{}" source=[{}] target=[{}] depend=[{}] require=[{}]'.format(self.name, '; '.join(self.rule), ','.join(self.source), ','.join(self.target), ','.join(self.depend), ','.join(self.require))
    def resource_satisfied(self, available_resource):
        """Check whether all required resources are satisfied.

        Numeric requirements are satisfied when the available amount is at
        least the required amount; string requirements must match exactly.
        """
        assert(isinstance(available_resource, dict))
        for name, value in iterdict(self.resource):
            if name not in available_resource:
                return False
            if isinstance(value, int) or isinstance(value, float):
                if not value <= available_resource[name]:
                    return False
            elif isstr(value):
                if value != available_resource[name]:
                    return False
            else:
                # BUG FIX: the message used to be passed as two arguments to
                # ValueError ("...: %s", value); format the value in instead.
                raise ValueError("Resource value must be int, float, or string: {}".format(value))
        return True
    def show_rule(self):
        """Return all rules joined into a single display string."""
        return '; '.join(self.rule)
    def show_task(self):
        """Return a detailed multi-line description of this task."""
        depend = " depend: {}\n".format(' '.join(self.depend)) if len(self.depend) != 0 else ""
        require = " require: {}\n".format(' '.join(self.require)) if len(self.require) != 0 else ""
        # names of the boolean options that are enabled on this task
        flags = zip([self.always, self.no_timestamp, self.ignore_same_task, self.ignore_error, self.no_exec, self.phony], ["always", "no_timestamp", "ignore_same_task", "ignore_error", "no_exec", "phony"])
        options = [label for enabled, label in flags if enabled]
        options_str = " options: {}\n".format(', '.join(options)) if len(options) > 0 else ""
        description = " description: {}\n".format(self.desc) if self.desc is not None else ""
        return """Task: {}
 source: {}
 target: {}
 rule: {}
{}{}{}{}""".format(self.name, ' '.join(self.source), ' '.join(self.target), '; '.join(self.rule), depend, require, options_str, description)
######################################################################
class TaskGraph:
    """Construct a graph of dependencies from task definitions"""
    def __init__(self, task_list, targets=None, always=False, no_timestamp=False, all_tasks=False):
        # task_list: list of Task definitions
        # targets: goal target patterns (matched with fnmatch); None/empty -> run everything
        # always: force re-execution even when targets look up-to-date
        # no_timestamp: only check target existence, not timestamps
        # all_tasks: run every task in the graph (except no_exec tasks)
        if not isinstance(task_list, list) or not all([isinstance(t, Task) for t in task_list]):
            raise ValueError("task_list must be list of Task")
        logger.debug('TaskGraph.task_list: %s', task_list)
        self.task_list = task_list
        # task dependency graph
        self.goal_targets = targets
        self.always = always # run tasks including up-to-date tasks
        self.no_timestamp = no_timestamp # run tasks only when targets do not exist (do not check timestamp)
        self.all_tasks = all_tasks # run all tasks in the task graph (except those with no_exec)
        self.prev_tasks = None # task -> previous tasks
        self.next_tasks = None # task -> next tasks
        self.initial_tasks = None # tasks to begin with
        self.executed_tasks = None # tasks to be executed, including up-to-date tasks
        self.outdated_tasks = None # outdated tasks -> needs to be executed
        self.__make_dependencies()
        pass
    def __make_prev_next_tasks(self):
        """Set prev_tasks and next_tasks by following source/target"""
        logger.debug('TaskGraph.__make_prev_next_tasks()')
        # Note: these two locals map file paths to task ids; the
        # self.prev_tasks/self.next_tasks built below map task id -> task ids.
        prev_tasks = {} # file path -> tasks that build this file
        next_tasks = {} # file path -> tasks that require this file
        # compute prev/next tasks for each file
        for task_id, task in enumerate(self.task_list):
            # use realpath to process same files with different paths
            sources = [os.path.realpath(source) for source in task.source + task.require]
            targets = [os.path.realpath(target) for target in task.target]
            # confirm sources and targets are disjoint
            if len(set(sources) & set(targets)) != 0:
                raise ValueError('Source and target files include the same file: {}'.format(task))
            # collect prev/next tasks of each file
            for source in sources:
                next_tasks[source] = next_tasks.get(source, []) + [task_id]
            for target in targets:
                prev_tasks[target] = prev_tasks.get(target, []) + [task_id]
        # confirm that all target files are built by a single task
        for target_path, task_ids in prev_tasks.items():
            if len(task_ids) > 1:
                raise ValueError('Found multiple tasks with the same target "{}": {}'.format(target_path, '; '.join(['"{}"'.format(self.task_list[task_id].name) for task_id in task_ids])))
        # compute prev/next tasks for each task
        self.prev_tasks = [[] for _ in range(self.num_tasks())]
        self.next_tasks = [[] for _ in range(self.num_tasks())]
        for task_id, task in enumerate(self.task_list):
            for source in task.source + task.require:
                source_path = os.path.realpath(source)
                self.prev_tasks[task_id].extend(prev_tasks.get(source_path, []))
            for target in task.target:
                target_path = os.path.realpath(target)
                self.next_tasks[task_id].extend(next_tasks.get(target_path, []))
        # deduplicate task ids collected from multiple files
        self.prev_tasks = [list(set(l)) for l in self.prev_tasks]
        self.next_tasks = [list(set(l)) for l in self.next_tasks]
        logger.debug('prev_tasks: %s', self.prev_tasks)
        logger.debug('next_tasks: %s', self.next_tasks)
        pass
    def __check_loops(self):
        """Check loops in the task graph"""
        # Iterative DFS with a three-color marking scheme; raises ValueError
        # with the offending task sequence when a cycle is found.
        logger.debug('TaskGraph.__check_loops()')
        # traverse all tasks from initial tasks
        task_stack = [task_id for task_id, _ in enumerate(self.task_list) if len(self.prev_tasks[task_id]) == 0]
        visited = [0] * len(self.task_list) # 0 -> not visited yet, 1 -> visiting, 2 -> already done
        while len(task_stack) > 0:
            task_id = task_stack.pop()
            if visited[task_id] == 1:
                # finish traversing all descendents
                visited[task_id] = 2
                continue
            visited[task_id] = 1 # visiting all descendents of this node
            task_stack.append(task_id) # will be popped when all descendents are visited
            for next_task in self.next_tasks[task_id][::-1]:
                if visited[next_task] == 1:
                    # loop detected
                    start_of_the_loop = task_stack.index(next_task)
                    loop = task_stack[start_of_the_loop:] + [next_task]
                    raise ValueError('Found loop in task dependencies: {}'.format(' -> '.join(['"{}"'.format(self.task_list[t].name) for t in loop])))
                if visited[next_task] == 2:
                    # already visited; not added to the stack
                    continue
                task_stack.append(next_task)
        # tasks not visited must make a loop
        # (a cycle with no entry point is unreachable from any initial task)
        unvisited_tasks = [task_id for task_id, _ in enumerate(self.task_list) if not visited[task_id]]
        if len(unvisited_tasks) > 0:
            logger.debug('TaskGraph.__check_loops(): loop(s) detected in the task graph')
            # at least one loop exists. find a sequence of tasks to make the loop
            task_stack = [unvisited_tasks[0]]
            loop_tasks = []
            while len(task_stack) > 0:
                task_id = task_stack.pop()
                loop_tasks.append(task_id)
                if visited[task_id]: break # end of the loop
                visited[task_id] = True
                task_stack.extend(self.next_tasks[task_id])
            raise ValueError('Found loop in task dependencies: {}'.format(' -> '.join(['"{}"'.format(self.task_list[t].name) for t in loop_tasks])))
        logger.debug('TaskGraph.__check_loops(): no loops detected')
        return
    def __traverse_backwards(self):
        """Traverse tasks from goal targets and obtain previous tasks and tasks to be executed"""
        logger.debug('TaskGraph.__traverse_backwards()')
        if self.goal_targets is None or len(self.goal_targets) == 0 or self.all_tasks:
            # all tasks (except no_exec) will be run
            # NOTE(review): executed_tasks is a list on this branch and a set
            # on the branch below; callers only iterate/len it, so both work.
            logger.debug('TaskGraph.__traverse_backwards(): targets are not specified. all targets will be run')
            self.executed_tasks = [task_id for task_id, task in enumerate(self.task_list) if not task.no_exec]
            return
        # traverse dependencies from goal targets
        logger.debug('TaskGraph.__traverse_backwards(): traverse dependencies from goal targets: %s', self.goal_targets)
        target_tasks = set()
        for target in self.goal_targets:
            # goal targets are fnmatch patterns matched against task targets
            tasks = [task_id for task_id, task in enumerate(self.task_list) if any([fnmatch(t, target) for t in task.target]) ]
            if len(tasks) == 0:
                raise ValueError('Target not found in task definitions: ' + target)
            target_tasks |= set(tasks)
        # walk the prev_tasks relation to a fixed point (ancestors of the goals)
        prev_tasks = target_tasks
        while True:
            new_tasks = set(sum([self.prev_tasks[task_id] for task_id in prev_tasks], []))
            if len(new_tasks) == 0: break
            target_tasks |= new_tasks
            prev_tasks = new_tasks
        self.executed_tasks = target_tasks
        logger.debug('TaskGraph.__traverse_backwards(): executed_tasks: %s', list(self.executed_tasks))
        pass
    def __set_initial_tasks(self):
        """Obtain initial tasks to begin with"""
        logger.debug('TaskGraph.__set_initial_tasks()')
        # initial task = tasks with empty prev_task
        self.initial_tasks = { task_id for task_id in self.executed_tasks if len(self.prev_tasks[task_id]) == 0 }
        logger.debug('TaskGraph.__set_initial_tasks(): initial_tasks: %s', self.initial_tasks)
        pass
    def __up_to_date(self, task):
        """Return True when the task's targets need no rebuild (make-style check)."""
        # always run the task
        if self.always or task.always or task.phony: return False
        # no targets -> always up-to-date
        if len(task.target) == 0: return True
        # no source -> up-to-date if all targets exists
        if len(task.source) == 0 and len(task.depend) == 0 and len(task.require) == 0:
            return all([os.path.exists(target) for target in task.target])
        # some source does not exist -> not up-to-date
        if not all([os.path.exists(f) for f in task.source + task.depend + task.require]):
            return False
        # some target does not exist -> not up-to-date
        if not all([os.path.exists(f) for f in task.target]):
            return False
        # do not check timestamp
        if self.no_timestamp or task.no_timestamp:
            return True
        # check timestamp to judge
        # (require files are intentionally excluded from the timestamp check)
        source_timestamps = [os.stat(f).st_mtime for f in task.source + task.depend]
        target_timestamps = [os.stat(f).st_mtime for f in task.target]
        return max(source_timestamps) <= min(target_timestamps)
    def __check_outdated_tasks(self):
        """Check outdated tasks"""
        # CAUTION: loops in the task graph cause an infinite loop
        logger.debug('TaskGraph.__check_outdated_tasks()')
        outdated_tasks = set()
        for task_id, task in enumerate(self.task_list):
            # task is added to the outdated_tasks if targets are not up-to-date
            if not self.__up_to_date(task):
                outdated_tasks.add(task_id)
        # propagate outdatedness forward: anything downstream of an outdated
        # task is also outdated
        prev_outdated_tasks = outdated_tasks
        while True:
            new_outdated_tasks = set(sum([self.next_tasks[task_id] for task_id in prev_outdated_tasks], []))
            if len(new_outdated_tasks) == 0: break
            outdated_tasks |= new_outdated_tasks
            prev_outdated_tasks = new_outdated_tasks
        self.outdated_tasks = outdated_tasks
        logger.debug('TaskGraph.__check_outdated_tasks(): outdated_tasks: %s', self.outdated_tasks)
        pass
    def __make_dependencies(self):
        """Make task dependency graph from task list and goal targets"""
        logger.debug('TaskGraph.make_dependencies()')
        # compute previous/next tasks
        self.__make_prev_next_tasks()
        # check loops in the task graph
        self.__check_loops()
        # obtain tasks that must be executed to build targets
        self.__traverse_backwards()
        # set initial tasks
        self.__set_initial_tasks()
        # check up-to-date tasks
        self.__check_outdated_tasks()
        pass
    def num_tasks(self):
        """Total number of defined tasks."""
        return len(self.task_list)
    def num_executed_tasks(self):
        """Number of tasks selected for execution (including up-to-date ones)."""
        return len(self.executed_tasks)
    def num_outdated_tasks(self):
        """Number of tasks whose targets are outdated."""
        return len(self.outdated_tasks)
    def num_active_tasks(self):
        """Number of tasks that are both selected for execution and outdated."""
        return len(set(self.executed_tasks) & set(self.outdated_tasks))
    def get_task(self, task_id):
        """Return the Task with the given id (index into task_list)."""
        return self.task_list[task_id]
    def is_executed(self, task_id):
        """True when the task is selected for execution."""
        return task_id in self.executed_tasks
    def is_outdated(self, task_id):
        """True when the task's targets are outdated."""
        return task_id in self.outdated_tasks
    def is_active(self, task_id):
        """True when the task will actually be run (executed and outdated)."""
        return self.is_executed(task_id) and self.is_outdated(task_id)
    def check_missing_sources(self):
        """Check whether input files exist"""
        # Sources that no task builds must already exist on disk; return the
        # list of such files that are missing.
        logger.debug('TaskGraph.check_missing_sources(): Collect sources that are not generated by any tasks')
        all_sources = set([os.path.realpath(f) for f in sum([self.get_task(task_id).source + self.get_task(task_id).require for task_id in self.executed_tasks], [])])
        all_targets = set([os.path.realpath(f) for f in sum([self.get_task(task_id).target for task_id in self.executed_tasks], [])])
        sources = all_sources - all_targets
        logger.debug('TaskGraph.check_missing_sources(): check existence: %s', sources)
        missing_sources = []
        for source in sources:
            if not os.path.exists(source):
                missing_sources.append(source)
        logger.debug('TaskGraph.check_missing_sources(): missing sources: %s', missing_sources)
        return missing_sources
    def draw_dependencies(self):
        """Output a task graph in dot format"""
        # Returns the graph as UTF-8 encoded bytes; active tasks are drawn
        # with solid boxes, inactive ones dashed.
        out = StringIO()
        out.write('digraph dependencies {\n')
        out.write(' rankdir = LR;')
        out.write(' node [shape = box];\n')
        for task_id, task in enumerate(self.task_list):
            if self.is_active(task_id):
                style = 'solid'
            else:
                style = 'dashed'
            #out.write(' t{} [label="{}" style={}];\n'.format(task_id, ';'.join(task.rule), style))
            name = task.name.replace('\n', '\\n').replace('"', '\\"')
            out.write(' t{} [label="{}" style={}];\n'.format(task_id, name, style))
            prev_tasks = self.prev_tasks[task_id]
            for prev_task in prev_tasks:
                out.write(' t{} -> t{};\n'.format(prev_task, task_id))
        out.write('}\n')
        return out.getvalue().encode()
######################################################################
## parallel processing functions
class TaskTerminatedException(Exception):
    """Raised inside a worker when its task is aborted by SIGTERM."""
    def __init__(self):
        super(TaskTerminatedException, self).__init__()

    def __str__(self):
        return 'Task terminated by SIGTERM'
def sigterm_handler(num, frame):
    """Signal handler that converts SIGTERM into TaskTerminatedException."""
    raise TaskTerminatedException
class Worker(multiprocessing.Process):
    """Get a task from the queue and execute it in multiprocessing
    If it receives 'None', the worker terminates."""
    def __init__(self, worker_id, input_queue, output_queue, env=None):
        # worker_id: index of this worker, reported back with every result
        # input_queue: queue of (task_id, callable); (task_id, None) stops the worker
        # output_queue: queue receiving (worker_id, task_id, status) tuples
        # env: optional dict merged into os.environ inside the worker process
        multiprocessing.Process.__init__(self)
        self.worker_id = worker_id
        self.input_queue = input_queue
        self.output_queue = output_queue
        if not (env is None or isinstance(env, dict)):
            raise ValueError('env must be a dict')
        self.env = env
        pass
    def run(self):
        """Worker main loop: fetch and execute tasks until a None sentinel arrives."""
        os.setsid() # disconnect from tty
        os.dup2(os.open(os.devnull, os.O_RDONLY), sys.stdin.fileno()) # disconnect stdin
        try:
            signal.signal(signal.SIGINT, signal.SIG_IGN) # ignore keyboard interrupt. this is handled by main process
            signal.signal(signal.SIGTERM, sigterm_handler) # raises exception for SIGTERM
            if self.env is not None:
                os.environ.update(self.env) # add environment
            while True:
                logger.debug('Worker %s waiting for task', self.name)
                (task_id, func) = self.input_queue.get()
                logger.debug('Worker %s got task', self.name)
                if func is None:
                    logger.debug('Worker %s terminates', self.name)
                    break # terminate this process
                logger.debug('Worker %s runs task %s', self.name, task_id)
                try:
                    # execute the task
                    ret = func()
                except TaskTerminatedException as e:
                    # put the result as failure (to clean up the task execution)
                    logger.debug('Worker %s terminates task %s due to SIGTERM', self.name, task_id)
                    self.output_queue.put((self.worker_id, task_id, 1))
                    raise e
                except Exception as e:
                    # unexpected error raised
                    logger.error(coloring('red', 'Error in running task %s: %s'), task_id, sys.exc_info()[1])
                    self.output_queue.put((self.worker_id, task_id, 1))
                    continue
                logger.debug('Worker %s finished task %s', self.name, task_id)
                self.output_queue.put((self.worker_id, task_id, ret))
                logger.debug('Worker %s has put the result of task %s', self.name, task_id)
        except TaskTerminatedException:
            # terminate the process and the whole process subtree spawned by the task
            logger.debug('Worker %s killing child processes', self.name)
            children = psutil.Process(self.pid).children(recursive=True)
            for child in children:
                child.terminate()
            logger.debug('Worker %s stops due to SIGTERM', self.name)
            return
class ExecCommand:
    """Callable that runs one task's rules inside a worker process.

    An instance is sent to a worker and invoked with no arguments; the
    call returns the task's exit status (0 on success, non-zero on failure).
    """
    def __init__(self, task_no, task, is_dry_run=False, touch=False, up_to_date=False, ignore_error=False):
        # task_no: progress label shown in the log, e.g. '(3/10)'
        # task: the Task to execute
        self.task_no = task_no
        self.task = task
        self.is_dry_run = is_dry_run # do not run commands
        self.touch = touch # run `touch` rather than executing commands
        self.up_to_date = up_to_date # whether the targets are up-to-date
        self.ignore_error = ignore_error # ignore errors of commands
    def exec_touch(self, targets):
        """Update the mtime of every target, creating missing files; return 0."""
        for target in targets:
            try:
                os.utime(target, None)
            except OSError:
                # BUG FIX: was a bare `except:` that swallowed everything;
                # only a missing/inaccessible file should fall through to
                # creating an empty file.
                open(target, 'a').close()
        return 0
    def exec_command(self, rule):
        """Run one shell command in its own process group; return its exit status.

        Returns 1 when the command cannot be started, when the task is
        terminated by SIGTERM, or on any unexpected error (in the latter
        two cases the command's process group is killed first).
        """
        assert(isstr(rule))
        signal.signal(signal.SIGINT, signal.SIG_IGN) # ignore keyboard interrupt. this is handled by main process
        signal.signal(signal.SIGTERM, sigterm_handler) # raises exception for SIGTERM
        try:
            logger.debug('ExecCommand:exec_command: %s', rule)
            # own process group so the whole command subtree can be killed at once
            proc = subprocess.Popen(rule, shell=True, close_fds=True, preexec_fn=os.setpgrp)
            ret = proc.wait()
            logger.debug('finished ExecCommand:exec_command with returncode=%s: %s', ret, rule)
        except OSError:
            # command cannot be executed
            logger.error(coloring('red', 'Command could not be executed: %s'), rule)
            return 1
        except TaskTerminatedException:
            logger.debug('ExecCommand terminates subprocess: %s', rule)
            try:
                os.killpg(proc.pid, signal.SIGTERM)
            except Exception:
                # proc may be unset or its group already gone; best-effort kill
                pass
            return 1
        except Exception:
            logger.debug('ExecCommand: Unknown error raised')
            try:
                os.killpg(proc.pid, signal.SIGTERM)
            except Exception:
                pass
            return 1
        return ret
    def __call__(self):
        """Execute the task according to the configured mode; return exit status."""
        try:
            if self.up_to_date:
                # does not execute the command because targets are up-to-date
                logger.info(coloring('blue', '%s [%s] targets up-to-date: ') + '%s', self.task_no, self.task.name, ', '.join(self.task.target))
                return 0
            elif len(self.task.rule) == 0:
                # no rule -> show "done" message
                logger.info(coloring('yellow', '%s [%s] done'), self.task_no, self.task.name)
                return 0
            elif self.is_dry_run:
                # dry-run mode: does not execute the command
                logger.info(coloring('green', '%s [%s] start: ') + '%s', self.task_no, self.task.name, self.task.show_rule())
                return 0
            elif self.touch and not self.task.phony:
                # touch mode: only update target timestamps
                logger.info(coloring('green', '%s [%s] start: ') + '%s', self.task_no, self.task.name, self.task.show_rule())
                ret = self.exec_touch(self.task.target)
                return ret
            else:
                logger.info(coloring('green', '%s [%s] start: ') + '%s', self.task_no, self.task.name, self.task.show_rule())
                for rule in self.task.rule:
                    ret = self.exec_command(rule)
                    if ret != 0 and not self.ignore_error:
                        # stop at the first failing rule
                        logger.error(coloring('red', '***** %s [%s] failed (status=%s) *****: ') + '%s', self.task_no, self.task.name, ret, self.task.show_rule())
                        return ret
                logger.info(coloring('yellow', '%s [%s] done: ') + '%s', self.task_no, self.task.name, self.task.show_rule())
                return 0
        except TaskTerminatedException:
            logger.debug('ExecCommand:__call__ terminated')
            return 1
class Scheduler:
def __init__(self, task_graph, dry_run=False, touch=False, keep_going=False, terminate_on_error=True, ignore_errors=False, ignore_missing_sources=False, num_jobs=1, environments=None, resources=None):
    """Validate options and prepare scheduler state.

    *environments* and *resources* default to one empty dict per worker;
    both must be lists of dicts whose length equals *num_jobs*.
    """
    if not isinstance(task_graph, TaskGraph):
        raise ValueError("task_graph must be an instance of TaskGraph")

    def _per_worker(name, values):
        # Normalize/validate a per-worker list of dicts.
        if values is None:
            values = [dict() for _ in range(num_jobs)]
        if not (isinstance(values, list) and all(isinstance(x, dict) for x in values)):
            raise ValueError("{} must be list of dict".format(name))
        if len(values) != num_jobs:
            raise ValueError("length of {} must be equal to num_jobs".format(name))
        return values

    environments = _per_worker("environments", environments)
    resources = _per_worker("resources", resources)
    self.task_graph = task_graph
    self.dry_run = dry_run
    self.touch = touch
    self.keep_going = keep_going
    self.terminate_on_error = terminate_on_error
    self.ignore_errors = ignore_errors
    self.ignore_missing_sources = ignore_missing_sources
    self.num_jobs = num_jobs
    self.environments = environments
    self.resources = resources
    self.num_queued_tasks = 0 # number of tasks added to the task queue
    self.num_succeeded_tasks = 0 # number of tasks finished successfully so far
    self.num_failed_tasks = 0 # number of tasks failed so far
def __add_task(self, task_id, done_tasks, task_queue):
"""Add a new task on the task queue
If the specified task is not outdated, the task is not executed and complete_task is called directly."""
logger.debug('Scheduler.__add_task()')
assert(self.task_graph.is_executed(task_id)) # task_id must be executed_task
self.num_queued_tasks += 1
task_no = '({}/{})'.format(self.num_queued_tasks, self.task_graph.num_executed_tasks())
logger.debug('Scheduler.__add_task(): put task %s to the queue', task_id)
command = ExecCommand(task_no, self.task_graph.get_task(task_id), is_dry_run=self.dry_run, touch=self.touch, up_to_date=not self.task_graph.is_outdated(task_id), ignore_error=self.ignore_errors or self.task_graph.get_task(task_id).ignore_error)
task_queue.append((task_id, command))
logger.debug('Scheduler.__add_task() done. task queue size: %s', len(task_queue))
pass
def __complete_task(self, task_id, done_tasks, task_queue, add_next_tasks):
"""Complete the task and add the next tasks to the task queue"""
logger.debug('Scheduler.__complete_task()')
assert(self.task_graph.is_executed(task_id)) # task_id must be executed_task
done_tasks.add(task_id)
self.num_succeeded_tasks += 1
logger.debug('%s tasks have been finished so far', len(done_tasks))
if not add_next_tasks:
return # do not add next tasks any more (maybe some task failed already)
next_tasks = [tid for tid in self.task_graph.next_tasks[task_id] if self.task_graph.is_executed(tid)]
logger.debug('After task %s, try adding the next %s tasks', task_id, len(next_tasks))
tasks_to_add = []
for next_task in next_tasks:
prev_tasks = self.task_graph.prev_tasks[next_task]
if all([t in done_tasks for t in prev_tasks]):
logger.debug('Adding task %s after task %s since all the previous tasks done', next_task, task_id)
tasks_to_add.append(next_task)
for tid in tasks_to_add:
self.__add_task(tid, done_tasks, task_queue)
logger.debug('Scheduler.__complete_task() done')
pass
def __process_failed_task(self, task_id, ret):
"""Post-process failed task; show error message and remove target files"""
logger.debug('Scheduler received task %s failure', task_id)
self.num_failed_tasks += 1
task = self.task_graph.get_task(task_id)
logger.debug('Scheduler removes targets of task %s', task_id)
self.__remove_targets(task_id)
pass
def __remove_targets(self, task_id):
"""Remove updated targets for the specified task
Should be called when the task is failed"""
# TODO: should remove only updated targets
# at the moment, all targets are removed
targets = self.task_graph.get_task(task_id).target
if len(targets) > 0:
logger.info(coloring('red', 'Removing targets: ') + '%s', ' '.join(targets))
rename_suffix = '.failed-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
for target in targets:
if os.path.exists(target):
renamed = target + rename_suffix
os.rename(target, renamed)
logger.debug('Rename %s -> %s', target, renamed)
pass
def __assign_tasks(self, task_queue, worker_queue, input_queues):
pending_tasks = []
while len(worker_queue) > 0 and len(task_queue) > 0:
task = task_queue.popleft()
worker_id = None
for i, id in enumerate(worker_queue):
if self.task_graph.get_task(task[0]).resource_satisfied(self.resources[id]):
worker_index = i
worker_id = id
break
if worker_id is None:
# no worker satisfies resource requirement. suspend this task.
pending_tasks.append(task)
continue
# assign the task to worker_id
worker_queue.pop(worker_index)
logger.debug('Scheduler: assign task %s to worker %s', task[0], worker_id)
input_queues[worker_id].put(task)
# put back suspended tasks
task_queue.extend(pending_tasks)
def run(self):
"""Run tasks"""
logger.debug('Scheduler.run()')
missing_sources = self.task_graph.check_missing_sources()
if len(missing_sources) > 0:
for s in sorted(missing_sources):
logger.error(coloring('red', 'Source not found: ') + '%s', s)
if not self.ignore_missing_sources:
self.num_failed_tasks = len(missing_sources)
return # Cannot run the tasks because some sources not found
initial_tasks = self.task_graph.initial_tasks
logger.debug('initial tasks: %s', initial_tasks)
done_tasks = set()
result_queue = multiprocessing.Queue()
logger.debug('Scheduler creating %s workers', self.num_jobs)
input_queues = [multiprocessing.Queue() for i in range(self.num_jobs)]
worker_pool = [Worker(i, input_queues[i], result_queue, self.environments[i]) for i in range(self.num_jobs)]
#worker_queue = deque(range(self.num_jobs))
worker_queue = list(range(self.num_jobs))
logger.debug('Scheduler adding %s initial tasks', len(initial_tasks))
task_queue = deque()
for task_id in initial_tasks:
self.__add_task(task_id, done_tasks, task_queue)
logger.debug('Scheduler starting %s workers', len(worker_pool))
for worker in worker_pool:
worker.start()
try:
signal.signal(signal.SIGTERM, sigterm_handler)
add_next_tasks = True
# loop while the task is remaining or some workers are working
while len(task_queue) > 0 or len(worker_queue) < len(worker_pool):
# assign tasks in the queue to workers as far as possible
logger.debug('Scheduler: assign tasks to workers: %s workers, %s tasks', len(worker_queue), len(task_queue))
self.__assign_tasks(task_queue, worker_queue, input_queues)
# retrieve task results and complete tasks
worker_id, task_id, ret = result_queue.get()
logger.debug('Scheduler recieved task %s result code %s from worker %s', task_id, ret, worker_id)
if ret == 0 or self.ignore_errors or self.task_graph.get_task(task_id).ignore_error:
# task succeeded
logger.debug('Scheduler: task %s finished successfully. complete this task', task_id)
self.__complete_task(task_id, done_tasks, task_queue, add_next_tasks)
else:
# task failed
logger.debug('Scheduler: task %s failed.', task_id)
self.__process_failed_task(task_id, ret)
if self.keep_going:
# continue other tasks as far as possible
logger.debug('Scheduler: continue scheduling.')
pass
elif self.terminate_on_error:
# send SIGTERM to terminate other processes
logger.debug('Scheduler: raise exception to terminate tasks')
raise TaskTerminatedException
else:
# stop scheduling, but waiting for other processes to finish
logger.debug('Scheduler: stop scheduling but wait until other running tasks finish')
logger.debug('task_queue has %s items. removing them.', len(task_queue))
task_queue.clear()
if self.num_jobs > 1:
logger.info(coloring('red', 'Waiting for running tasks to finish...'))
add_next_tasks = False # do not add next tasks any more
pass
logger.debug('Scheduler: worker %s is now available and added to the queue', worker_id)
worker_queue.append(worker_id)
logger.debug('Scheduler: done collecting results')
except (TaskTerminatedException, KeyboardInterrupt) as e:
# the scheduling is quit due to task failure, SIGTERM, or SIGINT
logger.info(coloring('red', 'Terminating running tasks...'))
logger.debug('Scheduler is stopping due to task failure, SIGTERM, or SIGINT')
logger.debug('task_queue has %s items. removing them.', len(task_queue))
task_queue.clear()
logger.debug('Scheduler: terminate %s workers', len(worker_pool))
for worker in worker_pool:
worker.terminate() # worker will terminate the current task with exit code 1
logger.debug('Scheduler: all tasks finished. Sending poison pill to the workers')
for input_queue in input_queues:
input_queue.put((0, None)) # poison pill
logger.debug('Scheduler waiting for %s workers to terminate.', len(worker_pool))
for worker in worker_pool:
worker.join()
logger.debug('Scheduler collecting remaining results from workers')
while not result_queue.empty():
# process finished tasks
worker_id, task_id, ret = result_queue.get()
if ret == 0 or self.ignore_errors or self.task_graph.get_task(task_id).ignore_error:
# task succeeded
logger.debug('Scheduler: task %s finished successfully. complete this task', task_id)
self.__complete_task(task_id, done_tasks, task_queue, add_next_tasks=False)
else:
# task failed
logger.debug('Scheduler: task %s failed.', task_id)
self.__process_failed_task(task_id, ret)
logger.debug('Scheduler.run() done')
pass
def task_failed(self):
return self.num_failed_tasks > 0
######################################################################
class Workflow:
def set_options(self, num_jobs = None,
dry_run = None,
touch = None,
list_tasks = None,
dependency_graph = None,
keep_going = None,
terminate_on_error = None,
ignore_errors = None,
ignore_missing_sources = None,
all_tasks = None,
always = None,
no_timestamp = None,
debug_level = None,
environment = None,
environments_distributed = None,
resources = None,
goal_targets = None):
if num_jobs is not None:
if not (isinstance(num_jobs, int) and num_jobs >= 1): raise ValueError("num_jobs must be positive int")
self.num_jobs = num_jobs
if dry_run is not None:
if not isinstance(dry_run, bool): raise ValueError("dry_run must be bool")
self.dry_run = dry_run
if touch is not None:
if not isinstance(touch, bool): raise ValueError("touch must be bool")
self.touch = touch
if list_tasks is not None:
if not isinstance(list_tasks, bool): raise ValueError("list_tasks must be bool")
self.list_tasks = list_tasks
if dependency_graph is not None:
if not isstr(dependency_graph): raise ValueError("dependency_graph must be string")
self.dependency_graph = dependency_graph
if keep_going is not None:
if not isinstance(keep_going, bool): raise ValueError("keep_going must be bool")
self.keep_going = keep_going
if terminate_on_error is not None:
if not isinstance(terminate_on_error, bool): raise ValueError("terminate_on_error must be bool")
self.terminate_on_error = terminate_on_error
if ignore_errors is not None:
if not isinstance(ignore_errors, bool): raise ValueError("ignore_errors must be bool")
self.ignore_errors = ignore_errors
if ignore_missing_sources is not None:
if not isinstance(ignore_missing_sources, bool): raise ValueError("ignore_missing_sources must be bool")
self.ignore_missing_sources = ignore_missing_sources
if all_tasks is not None:
if not isinstance(all_tasks, bool): raise ValueError("all_tasks must be bool")
self.all_tasks = all_tasks
if always is not None:
if not isinstance(always, bool): raise ValueError("always must be bool")
self.always = always
if no_timestamp is not None:
if not isinstance(no_timestamp, bool): raise ValueError("no_timestamp must be bool")
self.no_timestamp = no_timestamp
if debug_level is not None:
if debug_level not in [logging.DEBUG, logging.INFO, logging.ERROR]:
raise ValueError("debug_level must be either of logging.DEBUG, logging.INFO, logging.ERROR")
self.debug_level = debug_level
if environment is not None:
if not isinstance(environment, dict): raise ValueError("environment must be dict")
self.environment = environment
if environments_distributed is not None:
if not (isinstance(environments_distributed, list) and all([isinstance(e, dict) for e in environments_distributed])):
raise ValueError("environments_distributed must be list of dict")
self.environments_distributed = environments_distributed
if resources is not None:
if not (isinstance(resources, list) and all([isinstance(r, dict) for r in resources])):
raise ValueError("resources must be list of dict")
self.resources = resources
if goal_targets is not None:
if not (isinstance(goal_targets, list) and all([isstr(x) for x in goal_targets])):
raise ValueError("goal_targets must be list of strings")
self.goal_targets = goal_targets
pass
def init_options(self,
num_jobs = 1,
dry_run = False,
touch = False,
list_tasks = False,
dependency_graph = None,
keep_going = False,
terminate_on_error = True,
ignore_errors = False,
ignore_missing_sources = False,
all_tasks = False,
always = False,
no_timestamp = False,
debug_level = logging.INFO,
environment = None,
environments_distributed = None,
resources = None,
goal_targets = []):
self.dependency_graph = None
self.environment = None
self.environments_distributed = None
self.resources = None
self.set_options(num_jobs = num_jobs,
dry_run = dry_run,
touch = touch,
list_tasks = list_tasks,
dependency_graph = dependency_graph,
keep_going = keep_going,
terminate_on_error = terminate_on_error,
ignore_errors = ignore_errors,
ignore_missing_sources = ignore_missing_sources,
all_tasks = all_tasks,
always = always,
no_timestamp = no_timestamp,
debug_level = debug_level,
environment = environment,
environments_distributed = environments_distributed,
resources = resources,
goal_targets = goal_targets)
pass
def show_options(self):
bools = ','.join([x[1] for x in
zip([self.dry_run, self.touch, self.list_tasks, self.keep_going, self.terminate_on_error, self.ignore_errors, self.ignore_missing_sources, self.all_tasks, self.always, self.no_timestamp],
["dry_run", "touch", "list_tasks", "keep_going", "terminate_on_error", "ignore_errors", "ignore_missing_sources", "all_tasks", "always", "no_timestamp"])