#!/usr/bin/python
"""Boot-time set-up of the main (master / shadow) nodes of a virtual cluster.

Usage: this script [LOGFILE]

The script
  1. reads the BonFIRE defaults and polls the experiment until the main
     hosts are known, then stores them in /etc/hosts;
  2. on the node whose hostname starts with "master":
       * with a shadow node: configures DRBD on both nodes (the shadow is
         driven over SSH), creates an OCFS2 file system on /dev/drbd1 and
         mounts it on /volume on both nodes;
       * without a shadow node: formats /dev/xvde as ext3, mounts it on
         /volume and bind-mounts it on /shared;
       * exports /volume through NFS, installs OGS under /shared/ogs and
         starts vc-main-host-updater.py (on both nodes if a shadow exists);
  3. on any other node only logs that the master will drive the set-up.

Any exception is logged through ``log.exception`` and the script then ends
normally.
"""

import os
import os.path
import subprocess
import sys
import time

from logger import log, configure

if len(sys.argv) > 1:
    configure(logfile=sys.argv[1], debug=True, console=False)
else:
    configure(debug=True, console=True)

# When True a cluster made of a master without a shadow node is accepted.
allow_single_cluster = True
default_bonfire = "/etc/default/bonfire"
hosts_file = "/etc/hosts"

# Seconds to wait between two polls of the experiment description.
host_poll_interval = 5

# Answer file for the unattended OGS installer. Placeholders, in order:
# admin host, submit host (master), submit host (shadow), shadow host.
OGS_CONF_TEMPLATE = """
SGE_ROOT="/shared/ogs"
SGE_QMASTER_PORT="6444"
SGE_EXECD_PORT="6445"
SGE_ENABLE_SMF="false"
SGE_ENABLE_ST="false"
SGE_CLUSTER_NAME="virtual"
CELL_NAME="default"
ADMIN_USER=""
QMASTER_SPOOL_DIR="/shared/ogs/default/spool/qmaster"
EXECD_SPOOL_DIR="/shared/ogs/default/spool"
GID_RANGE="20000-20100"
SPOOLING_METHOD="classic"
DB_SPOOLING_SERVER="none"
PAR_EXECD_INST_COUNT="20"
ADMIN_HOST_LIST="%s"
SUBMIT_HOST_LIST="%s %s"
EXEC_HOST_LIST=""
EXECD_SPOOL_DIR_LOCAL=""
COPY_COMMAND="scp"
DEFAULT_DOMAIN="none"
ADMIN_MAIL="none"
ADD_TO_RC="false"
SET_FILE_PERMS="true"
RESCHEDULE_JOBS="wait"
SCHEDD_CONF="1"
SHADOW_HOST="%s"
REMOVE_RC="false"
HOSTNAME_RESOLVING="true"
"""

DRBD_GLOBAL_CONF = """global {
    usage-count no;
}

common {
    protocol C;
    meta-disk internal;

    syncer {
        rate 10M;
    }

    disk {
        on-io-error detach;
    }
}"""

# Placeholders, in order: master hostname, master IP, shadow hostname,
# shadow IP.
DRBD_RESOURCE_TEMPLATE = """resource r0 {
    net {
        allow-two-primaries;
        after-sb-0pri disconnect;
        after-sb-1pri disconnect;
        after-sb-2pri disconnect;
    }

    on %s {
        device /dev/drbd1;
        disk /dev/xvde;
        address %s:7789;
        meta-disk internal;
    }
    on %s {
        device /dev/drbd1;
        disk /dev/xvde;
        address %s:7789;
        meta-disk internal;
    }
}"""

# Placeholders, in order: master IP, master hostname, shadow IP, shadow
# hostname.
# NOTE(review): the indentation of this template could not be recovered from
# the extracted source; parameters are emitted tab-indented, which is the
# layout the O2CB documentation describes -- confirm against a deployed
# /etc/ocfs2/cluster.conf.
OCFS2_CLUSTER_TEMPLATE = (
    "cluster:\n"
    "\tnode_count = 2\n"
    "\tname = ocfs2\n"
    "node:\n"
    "\tip_port = 7777\n"
    "\tip_address = %s\n"
    "\tnumber = 0\n"
    "\tname = %s\n"
    "\tcluster = ocfs2\n"
    "node:\n"
    "\tip_port = 7777\n"
    "\tip_address = %s\n"
    "\tnumber = 1\n"
    "\tname = %s\n"
    "\tcluster = ocfs2"
)

NFS_EXPORT_COMMANDS = (
    "echo '/volume 172.18.0.0/16(rw,sync,no_root_squash,no_subtree_check)'"
    " >> /etc/exports\n"
    "/etc/init.d/nfs-kernel-server restart"
)

# Placeholder: name of the NFS server to mount /volume from.
NFS_MOUNT_TEMPLATE = (
    "mkdir -p /shared\n"
    "mount -t nfs -o nordirplus,hard,nointr,rw %s:/volume /shared"
)

MASTER_OGS_INSTALL_COMMANDS = """export SGE_ROOT=/shared/ogs
tar xzf /root/sge_root.tar.gz -C $SGE_ROOT/..
chown -R root:root $SGE_ROOT
cd $SGE_ROOT
./inst_sge -m -auto $SGE_ROOT/ogs.conf
echo "source $SGE_ROOT/default/common/settings.sh" >> /root/.bashrc
echo "export SGE_CHECK_INTERVAL=45" >> /root/.bashrc
echo "export SGE_GET_ACTIVE_INTERVAL=90" >> /root/.bashrc
echo "export SGE_DELAY_TIME=120" >> /root/.bashrc
cp $SGE_ROOT/default/common/sgemaster /etc/init.d"""

SHADOW_OGS_INSTALL_COMMANDS = """export SGE_ROOT=/shared/ogs
cd $SGE_ROOT
./inst_sge -sm -auto $SGE_ROOT/ogs.conf
echo ". $SGE_ROOT/default/common/settings.sh" >> /root/.bashrc
echo "export SGE_CHECK_INTERVAL=45" >> /root/.bashrc
echo "export SGE_GET_ACTIVE_INTERVAL=90" >> /root/.bashrc
echo "export SGE_DELAY_TIME=120" >> /root/.bashrc
cp $SGE_ROOT/default/common/sgemaster /etc/init.d"""


def _find_main_hosts(computes, master_host=None, shadow_host=None):
    """Return ``(master_host, shadow_host)`` found in *computes*.

    Each host is an ``(ip, hostname)`` tuple taken from the compute whose
    hostname starts with "master" / "shadow". A host that is not present in
    *computes* keeps the value passed in (``None`` by default).
    """
    for compute in computes:
        hostname = compute["hostname"]
        if hostname.startswith("master"):
            master_host = (compute["ip"], hostname)
        elif hostname.startswith("shadow"):
            shadow_host = (compute["ip"], hostname)
    return master_host, shadow_host


def _wait_for_main_hosts():
    """Poll the experiment until the required main hosts are known.

    The master is always required; the shadow is required only when
    ``allow_single_cluster`` is False. Returns ``(master_host, shadow_host)``
    where ``shadow_host`` may be ``None`` in single-cluster mode.
    """
    master_host = None
    shadow_host = None
    while True:
        experiment = bonfire.get_experiment(bonfire.experiment_id)
        master_host, shadow_host = _find_main_hosts(
            experiment["computes"], master_host, shadow_host)
        # The original condition stopped as soon as *either* host was seen,
        # which could leave master_host as None for the rest of the script.
        if master_host is not None and (
                allow_single_cluster or shadow_host is not None):
            return master_host, shadow_host
        # Do not hammer the BonFIRE API while the computes are still booting.
        time.sleep(host_poll_interval)


def _connect_to_shadow(hostname):
    """Open an SSH and an SFTP session to the shadow node, retrying forever.

    Returns ``(ssh, sftp)``. paramiko is imported here because it is only
    needed when a shadow node exists.
    """
    import paramiko

    ssh = paramiko.SSHClient()
    ssh.load_system_host_keys()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    while True:
        try:
            ssh.connect(hostname, username="root", password="bonfire",
                        timeout=10)
            sftp = ssh.open_sftp()
            log.info("Connected to: %s"
                     % str(ssh.exec_command("hostname")[1].read()))
            return ssh, sftp
        except Exception as err:
            # The shadow may still be booting: log and try again later.
            log.info("Shadow not available: ")
            log.exception(err)
            time.sleep(10)


def _remote(ssh, command):
    """Run *command* on the shadow node and return everything on its stdout."""
    return ssh.exec_command(command)[1].read()


def _remote_write(sftp, path, content):
    """Write *content* to *path* on the shadow node, always closing the file."""
    remote_file = sftp.open(path, 'w')
    try:
        remote_file.write(content)
    finally:
        remote_file.close()


def _connect_drbd():
    """(Re)initialise the local DRBD resource until it reports "Connected"."""
    output = vcutil.execute("drbd-overview")
    log.debug(output)
    log.info("Master - Connecting DRBD")
    while "Connected" not in output[0]:
        time.sleep(2)
        log.debug(vcutil.execute("drbdadm down r0"))
        log.debug(vcutil.execute("drbdadm -- --force wipe-md r0"))
        log.debug(vcutil.execute("/etc/init.d/drbd restart"))
        log.debug(vcutil.execute("drbdadm -- --force create-md r0"))
        log.debug(vcutil.execute("drbdadm up r0"))
        output = vcutil.execute("drbd-overview")
        log.debug(output)
        # Give the peer a chance to answer before resetting the resource again.
        while "WFConnection" in output[0]:
            time.sleep(2)
            output = vcutil.execute("drbd-overview")
            log.debug(output)


def _setup_replicated_volume(ssh, sftp, drbd_res, ocfs_conf):
    """Bring up DRBD on both nodes and create/mount OCFS2 on the master.

    The master's DRBD configuration files must already be written; *ssh* and
    *sftp* are the sessions to the shadow node.
    """
    log.info("Shadow - Configure DRBD")
    _remote_write(sftp, "/etc/drbd.d/global_common.conf", DRBD_GLOBAL_CONF)
    _remote_write(sftp, "/etc/drbd.d/r0.res", drbd_res)
    log.debug(_remote(ssh, "modprobe drbd"))
    log.debug(_remote(ssh, "/etc/init.d/drbd restart"))
    log.debug(_remote(ssh, "drbdadm -- --force create-md r0"))
    log.debug(vcutil.execute("drbd-overview"))
    log.debug(_remote(ssh, "drbdadm up r0"))

    _connect_drbd()

    log.info("Master - Syncing volumes...")
    log.debug(vcutil.execute("drbd-overview"))
    log.debug(vcutil.execute("drbdadm -- --clear-bitmap new-current-uuid r0"))
    log.debug(vcutil.execute("drbd-overview"))
    log.debug(vcutil.execute("drbdadm primary r0"))
    log.debug(vcutil.execute("drbd-overview"))

    log.info("Shadow - Setting both as primary...")
    log.debug(_remote(ssh, "drbdadm primary all"))

    log.info("Master - Configuring OCFS")
    with open("/etc/ocfs2/cluster.conf", 'w') as conf_file:
        conf_file.write(ocfs_conf)
    log.debug(vcutil.execute(
        "/etc/init.d/ocfs2 restart\n"
        "/etc/init.d/o2cb restart\n"
        "yes | mkfs -t ocfs2 -N 2 -F -L ocfs2_drbd1 /dev/drbd1\n"
        "mkdir -p /volume\n"
        "mount /dev/drbd1 /volume"))


def _share_volume_with_shadow(ssh, sftp, master_host, shadow_host, ocfs_conf):
    """Mount the shared volume through NFS on the master and on the shadow."""
    log.info("Master - Re-mounting volumes...")
    log.debug(vcutil.execute(NFS_MOUNT_TEMPLATE % master_host[1]))

    log.info("Shadow - Configuring OCFS")
    _remote_write(sftp, "/etc/ocfs2/cluster.conf", ocfs_conf)
    log.debug(_remote(
        ssh,
        "/etc/init.d/ocfs2 restart\n"
        "/etc/init.d/o2cb restart\n"
        "mkdir -p /volume\n"
        "mount /dev/drbd1 /volume"))

    log.info("Shadow - Configuring NFS")
    log.debug(_remote(ssh, NFS_EXPORT_COMMANDS))

    log.info("Shadow - Re-mounting volumes...")
    # Each main node mounts /volume from its own NFS server.
    log.debug(_remote(ssh, (NFS_MOUNT_TEMPLATE + "\ndf -h") % shadow_host[1]))


def _install_ogs_on_master(ogs_conf):
    """Install OGS under /shared/ogs and start the host-updater script."""
    log.info("Master - Installing OGS...")
    log.debug(vcutil.execute(
        "mkdir -p /shared/ogs\n"
        "mkdir -p /shared/home\n"
        "df -h"))
    with open("/shared/ogs/ogs.conf", 'w') as conf_file:
        conf_file.write(ogs_conf)
    log.debug(vcutil.execute(MASTER_OGS_INSTALL_COMMANDS))

    log.info("Master - Initializing Virtual Cluster script...")
    vcutil.execute(". /root/.bashrc; python -u /root/vc/vc-main-host-updater.py /var/log/vc-main-host-updater.log", fork=True)


def _install_ogs_on_shadow(ssh):
    """Install the OGS shadow master and start the host-updater script."""
    log.info("Shadow - Installing OGS...")
    log.debug(_remote(ssh, SHADOW_OGS_INSTALL_COMMANDS))

    log.info("Shadow - Initializing Virtual Cluster script...")
    # NOTE(review): read() waits for the remote stdout to close; confirm that
    # it returns while the backgrounded updater keeps running.
    log.debug(_remote(ssh, "nohup < /dev/null python -u /root/vc/vc-main-host-updater.py /var/log/vc-main-host-updater.log &"))


def _configure_master(master_host, shadow_host):
    """Run the whole master-side set-up (storage, NFS, OGS).

    *master_host* and *shadow_host* are ``(ip, hostname)`` tuples;
    *shadow_host* is ``None`` for a single-node cluster.
    """
    backup_hostname = shadow_host[1] if shadow_host else ""
    ogs_conf = OGS_CONF_TEMPLATE % (
        master_host[1], master_host[1], backup_hostname, backup_hostname)

    ssh = None
    sftp = None
    ocfs_conf = None
    try:
        if shadow_host:
            drbd_res = DRBD_RESOURCE_TEMPLATE % (
                master_host[1], master_host[0],
                shadow_host[1], shadow_host[0])
            ocfs_conf = OCFS2_CLUSTER_TEMPLATE % (
                master_host[0], master_host[1],
                shadow_host[0], shadow_host[1])

            log.info("Master - Configure DRBD")
            with open("/etc/drbd.d/global_common.conf", 'w') as conf_file:
                conf_file.write(DRBD_GLOBAL_CONF)
            with open("/etc/drbd.d/r0.res", 'w') as conf_file:
                conf_file.write(drbd_res)
            log.debug(vcutil.execute("modprobe drbd"))

            log.info("Master - Prepare SSH access to shadow")
            ssh, sftp = _connect_to_shadow(shadow_host[1])
            _setup_replicated_volume(ssh, sftp, drbd_res, ocfs_conf)
        else:
            log.info("Master - Formatting volume")
            log.debug(vcutil.execute(
                "mkfs -t ext3 /dev/xvde\n"
                "mkdir -p /volume\n"
                "mount /dev/xvde /volume\n"
                "mkdir /shared\n"
                "mount --bind /volume /shared"))

        log.info("Master - Configuring NFS")
        log.debug(vcutil.execute(NFS_EXPORT_COMMANDS))

        if shadow_host:
            _share_volume_with_shadow(
                ssh, sftp, master_host, shadow_host, ocfs_conf)

        _install_ogs_on_master(ogs_conf)

        if shadow_host:
            _install_ogs_on_shadow(ssh)
    finally:
        # Release the connection to the shadow whatever happened above.
        if sftp is not None:
            sftp.close()
        if ssh is not None:
            ssh.close()


try:
    # Project modules are imported inside the try block so that an import
    # failure ends up in the log like any other error.
    import bonfire
    import hostsfile
    import ogs
    import vcutil

    bonfire.read_defaults(default_bonfire)
    log.info("Bonfire URI: %s" % bonfire.uri)
    # The password is masked: credentials must not end up in the log file.
    log.info("Credentials: %s:%s" % (bonfire.user, "********"))
    log.info("Experiment: %s" % bonfire.experiment_id)

    # Basic hosts file for the main nodes
    log.info("Get experiment")
    master_host, shadow_host = _wait_for_main_hosts()
    main_hosts = [('127.0.0.1', 'localhost'), master_host]
    if shadow_host:
        main_hosts.append(shadow_host)
    log.debug(main_hosts)
    hostsfile.store(hosts_file, main_hosts)

    # The data volume (/dev/xvde) is neither zeroed nor wiped beforehand;
    # the dd-based steps that did so were already disabled in the original.
    if bonfire.hostname.startswith("master"):
        log.info("I am master node")
        _configure_master(master_host, shadow_host)
    else:
        log.info("I am shadow node, additional startup actions for me will be performed by master node")
except Exception as excpt:
    log.exception(excpt)