#!/usr/bin/python
"""Keep the local hosts file and OGS in sync with a bonfire experiment.

Start-up:
  * Logging is configured to write to the file named by the first
    command-line argument, or to the console when no argument is given.
  * Bonfire defaults are read from /etc/default/bonfire.
  * The OGS "all.q" queue and global settings are applied, retrying every
    10 seconds until the calls succeed.

Afterwards, every UPDATER_PERIOD seconds the experiment is fetched and:
  1. client computes that are not finished and whose IP is not yet in the
     hosts file are appended to it;
  2. client computes that are not finished and not yet known to OGS as
     execution hosts are added to OGS;
  3. client computes whose state contains DONE or FAILED are removed
     from OGS.

Each of the three steps logs its own exceptions so that a failure in one
does not prevent the others from running.
"""

import sys
import time

from logger import log,configure

# Seconds to sleep between two synchronisation rounds.
UPDATER_PERIOD = 30


def _is_client(compute):
    """Return True when the compute's hostname contains "client"."""
    return "client" in compute['hostname']


def _is_finished(compute):
    """Return True when the compute's state contains DONE or FAILED."""
    state = compute['state']
    return "DONE" in state or "FAILED" in state


if len(sys.argv) > 1:
    configure(logfile=sys.argv[1], debug=True, console=False)
else:
    configure(debug=True, console=True)

try:
    # Imported inside the try so that an import failure is reported through
    # the logger configured above.
    import bonfire
    import ogs
    import hostsfile

    default_bonfire = "/etc/default/bonfire"
    bonfire.read_defaults(default_bonfire)
    hosts_file = "/etc/hosts"

    log.info("Bonfire URI: %s" % bonfire.uri)
    # NOTE(review): this writes the password to the log in plain text --
    # confirm this is intended before shipping logs anywhere shared.
    log.info("Credentials: %s:%s" % (bonfire.user,bonfire.password,))
    log.info("Experiment: %s" % bonfire.experiment_id)

    log.debug("Set OGS all.q queue to use /bin/bash")
    # Retry until all three OGS configuration calls succeed.
    while True:
        try:
            ogs.modify_queue("all.q",{"shell":"/bin/bash"})
            ogs.modify_queue("all.q",{"rerun":"TRUE"})
            ogs.modify_global({"max_unheard":"00:02:00","reschedule_unknown":"00:01:00"})
            break
        except Exception as excpt:
            log.exception(excpt)
            time.sleep(10)

    # Main polling loop: runs until the process is killed.
    while True:
        try:
            log.info("Get experiment")
            experiment = bonfire.get_experiment(bonfire.experiment_id)

            try:
                # Update /etc/hosts
                log.info("Hosts file")
                hosts = hostsfile.load(hosts_file)
                # `in` replaces the Python-2-only dict.has_key();
                # assumes hostsfile.load returns a dict-like mapping keyed
                # by IP -- TODO confirm.
                new_hosts = [
                    (compute['ip'], compute['hostname'])
                    for compute in experiment['computes']
                    if _is_client(compute)
                    and not _is_finished(compute)
                    and compute['ip'] not in hosts
                ]
                log.debug(new_hosts)
                for ip, hostname in new_hosts:
                    # Add to hosts list file
                    hostsfile.append(hosts_file, ip, hostname)
            except Exception as excpt:
                log.exception(excpt)

            try:
                # Update OGS
                log.info("Update OGS")
                hosts = ogs.get_execution_hosts()
                new_hosts = [
                    (compute['ip'], compute['hostname'], float(compute['cpu']))
                    for compute in experiment['computes']
                    if _is_client(compute)
                    and not _is_finished(compute)
                    and compute['hostname'] not in hosts
                ]
                log.debug(new_hosts)
                # The IP is kept in the tuple for the debug log only.
                for _ip, hostname, cpu in new_hosts:
                    # Add to OGS
                    ogs.new_host(hostname, cpu)
            except Exception as excpt:
                log.exception(excpt)

            try:
                # Remove invalid hosts
                log.info("Remove invalid hosts")
                done_hostnames = [
                    compute['hostname']
                    for compute in experiment['computes']
                    if _is_client(compute) and _is_finished(compute)
                ]
                log.debug(done_hostnames)
                for hostname in done_hostnames:
                    # Remove from OGS
                    ogs.remove_host(hostname)
            except Exception as excpt:
                log.exception(excpt)
        except Exception as excpt:
            log.exception(excpt)
        # Wait before the next round whether or not this one succeeded.
        time.sleep(UPDATER_PERIOD)
except Exception as excpt:
    log.exception(excpt)