#!/usr/bin/python
#
# BonFIRE Virtual Clusters on Federated Clouds Demonstration Kit
#
# Copyright (c) Fundacion Centro Tecnologico de Supercomputacion de Galicia 2012
#
# License Apache Software
#
# The research leading to these results has received funding from
# the European Community's Seventh Framework Programme (FP7/2007-2013)
# under agreement number 257386
#
# This software is provided with ABSOLUTELY NO WARRANTY
#
import os
import os.path
import subprocess
import sys
import time
from logger import log,configure
# Select the log destination: the console when the script is started without
# arguments, otherwise the file named by the first command-line argument.
# Debug output is enabled in both cases.
if len(sys.argv) <= 1:
    configure(debug=True, console=True)
else:
    configure(logfile=sys.argv[1], debug=True, console=False)
# ---------------------------------------------------------------------------
# Main-node bootstrap.
#
# The master node discovers the main hosts (master and optional shadow) of the
# experiment, writes /etc/hosts, prepares the shared volume (DRBD + OCFS2 when
# a shadow exists, plain ext3 otherwise), exports it over NFS and installs
# OGS.  The shadow node is configured remotely by the master over SSH.
# ---------------------------------------------------------------------------

# Seconds to wait between two polls of the BonFIRE API while the main hosts
# are not yet listed in the experiment.
_EXPERIMENT_POLL_INTERVAL = 5

# Answer file for the OGS automatic installer.  Placeholders, in order:
# admin host, first submit host, second submit host, shadow host.
_OGS_CONF_TEMPLATE = """
SGE_ROOT="/shared/ogs"
SGE_QMASTER_PORT="6444"
SGE_EXECD_PORT="6445"
SGE_ENABLE_SMF="false"
SGE_ENABLE_ST="false"
SGE_CLUSTER_NAME="virtual"
CELL_NAME="default"
ADMIN_USER=""
QMASTER_SPOOL_DIR="/shared/ogs/default/spool/qmaster"
EXECD_SPOOL_DIR="/shared/ogs/default/spool"
GID_RANGE="20000-20100"
SPOOLING_METHOD="classic"
DB_SPOOLING_SERVER="none"
PAR_EXECD_INST_COUNT="20"
ADMIN_HOST_LIST="%s"
SUBMIT_HOST_LIST="%s %s"
EXEC_HOST_LIST=""
EXECD_SPOOL_DIR_LOCAL=""
COPY_COMMAND="scp"
DEFAULT_DOMAIN="none"
ADMIN_MAIL="none"
ADD_TO_RC="false"
SET_FILE_PERMS="true"
RESCHEDULE_JOBS="wait"
SCHEDD_CONF="1"
SHADOW_HOST="%s"
REMOVE_RC="false"
HOSTNAME_RESOLVING="true"
"""

# DRBD settings shared by every resource (/etc/drbd.d/global_common.conf).
_DRBD_GLOBAL_CONF = """global {
usage-count no;
}
common {
protocol C;
meta-disk internal;
syncer {
rate 10M;
}
disk {
on-io-error detach;
}
}"""

# DRBD resource r0 (/etc/drbd.d/r0.res).  Placeholders, in order:
# master hostname, master IP, shadow hostname, shadow IP.
_DRBD_RESOURCE_TEMPLATE = """resource r0 {
net {
allow-two-primaries;
after-sb-0pri disconnect;
after-sb-1pri disconnect;
after-sb-2pri disconnect;
}
on %s {
device /dev/drbd1;
disk /dev/xvde;
address %s:7789;
meta-disk internal;
}
on %s {
device /dev/drbd1;
disk /dev/xvde;
address %s:7789;
meta-disk internal;
}
}"""

# O2CB cluster layout (/etc/ocfs2/cluster.conf).  Placeholders, in order:
# master IP, master hostname, shadow IP, shadow hostname.
# The O2CB file format is whitespace sensitive: stanza headers start in the
# first column, every parameter starts after a TAB and stanzas are separated
# by a blank line, hence the explicit "\t" and "\n" escapes.
_OCFS2_CLUSTER_TEMPLATE = (
    "cluster:\n"
    "\tnode_count = 2\n"
    "\tname = ocfs2\n"
    "\n"
    "node:\n"
    "\tip_port = 7777\n"
    "\tip_address = %s\n"
    "\tnumber = 0\n"
    "\tname = %s\n"
    "\tcluster = ocfs2\n"
    "\n"
    "node:\n"
    "\tip_port = 7777\n"
    "\tip_address = %s\n"
    "\tnumber = 1\n"
    "\tname = %s\n"
    "\tcluster = ocfs2\n"
)

# Master, replicated setup: start the cluster stack, create the OCFS2 file
# system on the DRBD device and mount it.
_MASTER_OCFS_SETUP = """/etc/init.d/ocfs2 restart
/etc/init.d/o2cb restart
yes | mkfs -t ocfs2 -N 2 -F -L ocfs2_drbd1 /dev/drbd1
mkdir -p /volume
mount /dev/drbd1 /volume"""

# Shadow, replicated setup: start the cluster stack and mount the file system
# the master created.
_SHADOW_OCFS_SETUP = """/etc/init.d/ocfs2 restart
/etc/init.d/o2cb restart
mkdir -p /volume
mount /dev/drbd1 /volume"""

# Master, single-node setup: plain ext3 volume, bind-mounted on /shared.
# "mkdir -p" keeps the step from failing when /shared already exists.
_SINGLE_NODE_VOLUME_SETUP = """mkfs -t ext3 /dev/xvde
mkdir -p /volume
mount /dev/xvde /volume
mkdir -p /shared
mount --bind /volume /shared"""

# Export /volume over NFS (run on the master and, when present, the shadow).
_NFS_EXPORT_SETUP = """echo '/volume 172.18.0.0/16(rw,sync,no_root_squash,no_subtree_check)' >> /etc/exports
/etc/init.d/nfs-kernel-server restart"""

# Mount the NFS export of host %s on /shared.
_MASTER_NFS_MOUNT_TEMPLATE = """mkdir -p /shared
mount -t nfs -o nordirplus,hard,nointr,rw %s:/volume /shared"""

_SHADOW_NFS_MOUNT_TEMPLATE = """mkdir -p /shared
mount -t nfs -o nordirplus,hard,nointr,rw %s:/volume /shared
df -h"""

_OGS_DIRECTORIES_SETUP = """mkdir -p /shared/ogs
mkdir -p /shared/home
df -h"""

# Unpack OGS on the shared volume and install the qmaster (inst_sge -m).
_MASTER_OGS_INSTALL = """export SGE_ROOT=/shared/ogs
tar xzf /root/sge_root.tar.gz -C $SGE_ROOT/..
chown -R root:root $SGE_ROOT
cd $SGE_ROOT
./inst_sge -m -auto $SGE_ROOT/ogs.conf
echo "source $SGE_ROOT/default/common/settings.sh" >> /root/.bashrc
echo "export SGE_CHECK_INTERVAL=45" >> /root/.bashrc
echo "export SGE_GET_ACTIVE_INTERVAL=90" >> /root/.bashrc
echo "export SGE_DELAY_TIME=120" >> /root/.bashrc
cp $SGE_ROOT/default/common/sgemaster /etc/init.d"""

# Install the shadow master (inst_sge -sm) from the already unpacked tree.
_SHADOW_OGS_INSTALL = """export SGE_ROOT=/shared/ogs
cd $SGE_ROOT
./inst_sge -sm -auto $SGE_ROOT/ogs.conf
echo ". $SGE_ROOT/default/common/settings.sh" >> /root/.bashrc
echo "export SGE_CHECK_INTERVAL=45" >> /root/.bashrc
echo "export SGE_GET_ACTIVE_INTERVAL=90" >> /root/.bashrc
echo "export SGE_DELAY_TIME=120" >> /root/.bashrc
cp $SGE_ROOT/default/common/sgemaster /etc/init.d"""

_MASTER_UPDATER_COMMAND = ". /root/.bashrc; python -u /root/vc/vc-main-host-updater.py /var/log/vc-main-host-updater.log"
_SHADOW_UPDATER_COMMAND = "nohup < /dev/null python -u /root/vc/vc-main-host-updater.py /var/log/vc-main-host-updater.log &"


def _wait_for_main_hosts(allow_single_cluster):
    """Poll the experiment until the required main hosts are listed.

    The master is always required; the shadow is required only when
    ``allow_single_cluster`` is false.  A host found in an earlier poll is
    kept even if a later poll does not list it.

    Returns a ``(master, shadow)`` pair of ``(ip, hostname)`` tuples;
    ``shadow`` is ``None`` when no shadow host was seen.
    """
    master = None
    shadow = None
    while True:
        experiment = bonfire.get_experiment(bonfire.experiment_id)
        for compute in experiment['computes']:
            name = compute["hostname"]
            if name.startswith("master"):
                master = (compute['ip'], name)
            elif name.startswith("shadow"):
                shadow = (compute['ip'], name)
        if master is not None and (allow_single_cluster or shadow is not None):
            return master, shadow
        # Do not hammer the API while the hosts are still being created.
        time.sleep(_EXPERIMENT_POLL_INTERVAL)


def _write_local(path, content):
    """Overwrite the local file ``path`` with ``content``."""
    with open(path, 'w') as handle:
        handle.write(content)


def _write_remote(sftp, path, content):
    """Overwrite ``path`` on the SFTP peer with ``content``."""
    remote = sftp.open(path, 'w')
    try:
        remote.write(content)
    finally:
        # Close the handle even when the write fails.
        remote.close()


def _run_remote(ssh, command):
    """Run ``command`` over ``ssh`` and return everything it wrote to stdout."""
    # exec_command returns (stdin, stdout, stderr); reading stdout to EOF
    # also waits for the command to finish writing.
    return ssh.exec_command(command)[1].read()


def _connect_to_shadow(hostname):
    """Open SSH and SFTP sessions to the shadow, retrying until it answers.

    Returns the ``(ssh, sftp)`` pair.  Failures are logged and retried every
    10 seconds without limit.
    """
    # Imported lazily: paramiko is only needed when a shadow host exists.
    import paramiko
    ssh = paramiko.SSHClient()
    ssh.load_system_host_keys()
    # NOTE(review): unknown host keys are accepted automatically and the root
    # password is hard-coded; key-based login would be preferable
    # (key_filename="/root/.ssh/id_rsa").
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    while True:
        try:
            ssh.connect(hostname, username="root", password="bonfire", timeout=10)
            sftp = ssh.open_sftp()
            log.info("Connected to: %s" % str(_run_remote(ssh, "hostname")))
            return ssh, sftp
        except Exception as err:
            log.info("Shadow not available: ")
            log.exception(err)
            time.sleep(10)


def _drbd_overview():
    """Run ``drbd-overview`` locally, log its result and return it."""
    output = vcutil.execute("drbd-overview")
    log.debug(output)
    return output


def _connect_drbd():
    """Re-create the local DRBD resource until it reports ``Connected``.

    Presumably ``vcutil.execute`` returns a sequence whose first item is the
    command output -- confirm against vcutil.
    """
    output = _drbd_overview()
    log.info("Master - Connecting DRBD")
    while "Connected" not in output[0]:
        time.sleep(2)
        # Tear the resource down and rebuild its metadata from scratch.
        for command in ("drbdadm down r0",
                        "drbdadm -- --force wipe-md r0",
                        "/etc/init.d/drbd restart",
                        "drbdadm -- --force create-md r0",
                        "drbdadm up r0"):
            log.debug(vcutil.execute(command))
        output = _drbd_overview()
        # Give the peer time to answer before deciding to start over.
        while "WFConnection" in output[0]:
            time.sleep(2)
            output = _drbd_overview()


def _replicate_volume(ssh, sftp, drbd_res, ocfs_conf):
    """Bring up DRBD on both nodes and create the OCFS2 volume on the master."""
    log.info("Shadow - Configure DRBD")
    _write_remote(sftp, "/etc/drbd.d/global_common.conf", _DRBD_GLOBAL_CONF)
    _write_remote(sftp, "/etc/drbd.d/r0.res", drbd_res)
    for command in ("modprobe drbd",
                    "/etc/init.d/drbd restart",
                    "drbdadm -- --force create-md r0"):
        log.debug(_run_remote(ssh, command))
    _drbd_overview()
    log.debug(_run_remote(ssh, "drbdadm up r0"))
    _connect_drbd()

    log.info("Master - Syncing volumes...")
    _drbd_overview()
    # A fresh UUID with a cleared bitmap is used instead of a full
    # "--overwrite-data-of-peer" synchronisation of the empty volume.
    log.debug(vcutil.execute("drbdadm -- --clear-bitmap new-current-uuid r0"))
    _drbd_overview()
    log.debug(vcutil.execute("drbdadm primary r0"))
    _drbd_overview()

    log.info("Shadow - Setting both as primary...")
    log.debug(_run_remote(ssh, "drbdadm primary all"))

    log.info("Master - Configuring OCFS")
    _write_local("/etc/ocfs2/cluster.conf", ocfs_conf)
    log.debug(vcutil.execute(_MASTER_OCFS_SETUP))


def _configure_master(master_host, shadow_host):
    """Prepare storage, NFS and OGS on the master and, if present, the shadow.

    ``master_host`` and ``shadow_host`` are ``(ip, hostname)`` tuples;
    ``shadow_host`` may be ``None`` for a single-node cluster.
    """
    backup_hostname = shadow_host[1] if shadow_host else ""
    ogs_conf = _OGS_CONF_TEMPLATE % (master_host[1], master_host[1],
                                     backup_hostname, backup_hostname)
    ssh = None
    sftp = None
    drbd_res = None
    ocfs_conf = None

    # NOTE: earlier revisions zeroed /dev/xvde with dd before use; that step
    # is disabled.
    if shadow_host:
        drbd_res = _DRBD_RESOURCE_TEMPLATE % (master_host[1], master_host[0],
                                              shadow_host[1], shadow_host[0])
        ocfs_conf = _OCFS2_CLUSTER_TEMPLATE % (master_host[0], master_host[1],
                                               shadow_host[0], shadow_host[1])
        log.info("Master - Configure DRBD")
        _write_local("/etc/drbd.d/global_common.conf", _DRBD_GLOBAL_CONF)
        _write_local("/etc/drbd.d/r0.res", drbd_res)
        log.debug(vcutil.execute("modprobe drbd"))
        log.info("Master - Prepare SSH access to shadow")
        ssh, sftp = _connect_to_shadow(shadow_host[1])

    try:
        # Shared volume: replicated OCFS2 on DRBD, or a local ext3 volume.
        if shadow_host:
            _replicate_volume(ssh, sftp, drbd_res, ocfs_conf)
        else:
            log.info("Master - Formatting volume")
            log.debug(vcutil.execute(_SINGLE_NODE_VOLUME_SETUP))

        log.info("Master - Configuring NFS")
        log.debug(vcutil.execute(_NFS_EXPORT_SETUP))

        if shadow_host:
            # Each main node mounts its own NFS export on /shared.
            log.info("Master - Re-mounting volumes...")
            log.debug(vcutil.execute(_MASTER_NFS_MOUNT_TEMPLATE % master_host[1]))
            log.info("Shadow - Configuring OCFS")
            _write_remote(sftp, "/etc/ocfs2/cluster.conf", ocfs_conf)
            log.debug(_run_remote(ssh, _SHADOW_OCFS_SETUP))
            log.info("Shadow - Configuring NFS")
            log.debug(_run_remote(ssh, _NFS_EXPORT_SETUP))
            log.info("Shadow - Re-mounting volumes...")
            log.debug(_run_remote(ssh, _SHADOW_NFS_MOUNT_TEMPLATE % shadow_host[1]))

        # Install OGS and start the virtual cluster updater scripts.
        log.info("Master - Installing OGS...")
        log.debug(vcutil.execute(_OGS_DIRECTORIES_SETUP))
        _write_local("/shared/ogs/ogs.conf", ogs_conf)
        log.debug(vcutil.execute(_MASTER_OGS_INSTALL))
        log.info("Master - Initializing Virtual Cluster script...")
        vcutil.execute(_MASTER_UPDATER_COMMAND, fork = True)

        if shadow_host:
            log.info("Shadow - Installing OGS...")
            log.debug(_run_remote(ssh, _SHADOW_OGS_INSTALL))
            log.info("Shadow - Initializing Virtual Cluster script...")
            log.debug(_run_remote(ssh, _SHADOW_UPDATER_COMMAND))
    finally:
        # Release the SSH/SFTP sessions whether or not the setup succeeded.
        if ssh is not None:
            ssh.close()


try:
    # Project modules are imported inside the try block so that an import
    # failure ends up in the log like any other error.
    import bonfire
    import hostsfile
    import ogs
    import vcutil

    allow_single_cluster = True
    default_bonfire = "/etc/default/bonfire"
    bonfire.read_defaults(default_bonfire)
    hosts_file = "/etc/hosts"

    log.info("Bonfire URI: %s" % bonfire.uri)
    # The password is masked so that it never reaches the log.
    log.info("Credentials: %s:%s" % (bonfire.user, "********"))
    log.info("Experiment: %s" % bonfire.experiment_id)

    # Basic hosts file for the main nodes.
    log.info("Get experiment")
    master_host, shadow_host = _wait_for_main_hosts(allow_single_cluster)
    main_hosts = [('127.0.0.1', 'localhost'), master_host]
    if shadow_host:
        main_hosts.append(shadow_host)
    log.debug(main_hosts)
    hostsfile.store(hosts_file, main_hosts)

    if bonfire.hostname.startswith("master"):
        log.info("I am master node")
        _configure_master(master_host, shadow_host)
    else:
        log.info("I am shadow node, additional startup actions for me will be performed by master node")
except Exception as excpt:
    # Top-level boundary: record the failure with its traceback in the log.
    log.exception(excpt)