#!/usr/bin/python
#
# BonFIRE Virtual Clusters on Federated Clouds Demonstration Kit
#
# Copyright (c) Fundacion Centro Tecnologico de Supercomputacion de Galicia 2012
#
# License GPL Version 3
#
# The research leading to these results has received funding from
# the European Community's Seventh Framework Programme (FP7/2007-2013)
# under agreement number 257386
#
# This software is provided with ABSOLUTELY NO WARRANTY
#
# Bootstrap script for a two-node (master + optional shadow) virtual cluster:
#  * waits until the BonFIRE experiment exposes the master (and shadow) computes,
#  * writes /etc/hosts for the main nodes,
#  * on the master: configures DRBD + OCFS2 (dual-primary) or a plain ext3
#    volume, exports it over NFS, installs Open Grid Scheduler (OGS/SGE) on
#    both nodes over SSH, and launches the virtual-cluster host updater.
# On a shadow node the script does nothing; the master drives it remotely.
#
import os
import os.path
import subprocess
import sys
import time

from logger import log,configure

# Optional first CLI argument: path of a log file. With a log file we run
# non-interactively (no console output); otherwise log to the console.
if len(sys.argv)>1:
    configure(logfile = sys.argv[1], debug=True, console=False)
else:
    configure(debug=True, console=True)

# Everything below is wrapped in one try/except so that any failure is at
# least recorded in the log before the (unattended) boot script exits.
try:
    import bonfire
    import hostsfile
    import ogs
    import vcutil

    # When True, a cluster with only a master (no shadow/failover node) is
    # acceptable and we do not block waiting for a shadow to appear.
    allow_single_cluster = True

    # Site defaults (URI, credentials, experiment id) are read into the
    # bonfire module's globals from /etc/default/bonfire.
    default_bonfire = "/etc/default/bonfire"
    bonfire.read_defaults(default_bonfire)
    hosts_file = "/etc/hosts"

    log.info("Bonfire URI: %s" % bonfire.uri)
    # NOTE(review): this logs the plaintext password to the log file.
    log.info("Credentials: %s:%s" % (bonfire.user,bonfire.password,))
    log.info("Experiment: %s" % bonfire.experiment_id)

    #Basic hostsfile for main nodes
    # Poll the experiment description until the master (and, when required,
    # the shadow) compute resources are visible. Hosts are kept as
    # (ip, hostname) tuples.
    master_host = None
    shadow_host = None
    log.info("Get experiment")
    # NOTE(review): with allow_single_cluster == True this condition reduces
    # to "master is None AND shadow is None", so the loop can exit with
    # master_host still None if the shadow compute appears first — the
    # intended condition was presumably
    #   (master is None) or (not allow_single_cluster and shadow is None).
    # Left unchanged here; confirm against deployment behavior.
    while (master_host == None) and ((not allow_single_cluster) or (shadow_host == None)) :
        experiment = bonfire.get_experiment(bonfire.experiment_id)
        for compute in experiment['computes']:
            # Role is encoded in the compute's hostname prefix.
            if compute["hostname"].startswith("master"):
                master_host = (compute['ip'],compute['hostname'])
            elif compute["hostname"].startswith("shadow"):
                shadow_host = (compute['ip'],compute['hostname'])
    main_hosts = [('127.0.0.1','localhost'), master_host]
    if shadow_host:
        main_hosts.append(shadow_host)
    log.debug(main_hosts)
    hostsfile.store(hosts_file,main_hosts)

    #Initialize volume
    #log.info "Zeroing volume"
    #log.info vcutil.execute("dd if=/dev/zero of=/dev/xvde bs=1M ",ignore_error=True)
    #Destroy previous file system
    # log.info("Destroying file system")
    # log.info(vcutil.execute("dd if=/dev/zero of=/dev/xvde bs=1M count=10",ignore_error=True))

    # Only the master node performs the cluster setup; a shadow node is
    # configured remotely by the master (see the else branch at the bottom).
    if bonfire.hostname.startswith("master"):
        log.info("I am master node")
        # Auto-installer configuration for OGS/SGE. The shadow hostname is
        # substituted as backup submit host and SHADOW_HOST when present,
        # otherwise those fields stay empty.
        backup_hostname = ""
        if shadow_host:
            backup_hostname = shadow_host[1]
        ogs_conf = """
SGE_ROOT=\"/shared/ogs\"
SGE_QMASTER_PORT=\"6444\"
SGE_EXECD_PORT=\"6445\"
SGE_ENABLE_SMF=\"false\"
SGE_ENABLE_ST=\"false\"
SGE_CLUSTER_NAME=\"virtual\"
CELL_NAME=\"default\"
ADMIN_USER=\"\"
QMASTER_SPOOL_DIR=\"/shared/ogs/default/spool/qmaster\"
EXECD_SPOOL_DIR=\"/shared/ogs/default/spool\"
GID_RANGE=\"20000-20100\"
SPOOLING_METHOD=\"classic\"
DB_SPOOLING_SERVER=\"none\"
PAR_EXECD_INST_COUNT=\"20\"
ADMIN_HOST_LIST=\"%s\"
SUBMIT_HOST_LIST=\"%s %s\"
EXEC_HOST_LIST=\"\"
EXECD_SPOOL_DIR_LOCAL=\"\"
COPY_COMMAND=\"scp\"
DEFAULT_DOMAIN=\"none\"
ADMIN_MAIL=\"none\"
ADD_TO_RC=\"false\"
SET_FILE_PERMS=\"true\"
RESCHEDULE_JOBS=\"wait\"
SCHEDD_CONF=\"1\"
SHADOW_HOST=\"%s\"
REMOVE_RC=\"false\"
HOSTNAME_RESOLVING=\"true\"
""" % (master_host[1], master_host[1], backup_hostname, backup_hostname)
        if shadow_host:
            # DRBD configuration for a dual-primary resource r0 replicating
            # /dev/xvde between master and shadow over port 7789, on top of
            # which OCFS2 provides a shared-writable filesystem.
            drbd_global = """global {
    usage-count no;
}
common {
    protocol C;
    meta-disk internal;
    syncer {
        rate 10M;
    }
    disk {
        on-io-error detach;
    }
}"""
            drbd_res = """resource r0 {
    net {
        allow-two-primaries;
        after-sb-0pri disconnect;
        after-sb-1pri disconnect;
        after-sb-2pri disconnect;
    }
    on %s {
        device /dev/drbd1;
        disk /dev/xvde;
        address %s:7789;
        meta-disk internal;
    }
    on %s {
        device /dev/drbd1;
        disk /dev/xvde;
        address %s:7789;
        meta-disk internal;
    }
}""" % (master_host[1],master_host[0],shadow_host[1],shadow_host[0])
            # OCFS2 two-node cluster map (node 0 = master, node 1 = shadow).
            ocfs_conf = """cluster:
node_count = 2
name = ocfs2
node:
ip_port = 7777
ip_address = %s
number = 0
name = %s
cluster = ocfs2
node:
ip_port = 7777
ip_address = %s
number = 1
name = %s
cluster = ocfs2""" % (master_host[0],master_host[1],shadow_host[0],shadow_host[1])

            #Install DRBD
            log.info("Master - Configure DRBD")
            with open("/etc/drbd.d/global_common.conf",'w') as file:
                file.write(drbd_global)
            with open("/etc/drbd.d/r0.res",'w') as file:
                file.write(drbd_res)
            log.debug(vcutil.execute("modprobe drbd"))

            log.info("Master - Prepare SSH access to shadow")
            import paramiko
            ssh = paramiko.SSHClient()
            ssh.load_system_host_keys()
            # Accept the shadow's host key on first contact (trusted
            # experiment-internal network).
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            sftp = None
            # Retry every 10 s until the shadow node accepts SSH logins.
            # NOTE(review): hard-coded root password in the image; the
            # commented-out key-based connect was the alternative.
            while True:
                try:
                    # ssh.connect(shadow_host[1],key_filename="/root/.ssh/id_rsa")
                    ssh.connect(shadow_host[1],username="root",password="bonfire",timeout=10)
                    sftp = ssh.open_sftp()
                    # exec_command returns (stdin, stdout, stderr); [1] is stdout.
                    log.info("Connected to: %s" % str(ssh.exec_command("hostname")[1].read()))
                    break
                except Exception as err:
                    log.info("Shadow not available: ")
                    log.exception(err)
                    time.sleep(10)

            log.info("Shadow - Configure DRBD")
            #log.info("Fill with %i blocks of 1M" % dev_size)
            #log.info(ssh.exec_command("dd if=/dev/zero of=/dev/xvde bs=1M count=%i" % dev_size)[1].read())
            #log.info(ssh.exec_command("dd if=/dev/zero of=/dev/xvde bs=1M" )[1].read())
            #log.info("Destroying file system")
            #log.info(ssh.exec_command("dd if=/dev/zero of=/dev/xvde bs=1M count=10")[1].read())
            # Push the same DRBD configuration to the shadow over SFTP and
            # bring its side of the resource up.
            file = sftp.open("/etc/drbd.d/global_common.conf",'w')
            file.write(drbd_global)
            file.close()
            file = sftp.open("/etc/drbd.d/r0.res",'w')
            file.write(drbd_res)
            file.close()
            log.debug(ssh.exec_command("modprobe drbd")[1].read())
            log.debug(ssh.exec_command("/etc/init.d/drbd restart")[1].read())
            log.debug(ssh.exec_command("drbdadm -- --force create-md r0")[1].read())
            log.debug(vcutil.execute("drbd-overview"))
            log.debug(ssh.exec_command("drbdadm up r0")[1].read())
            output = vcutil.execute("drbd-overview")
            log.debug(output)
            log.info("Master - Connecting DRBD")
            # Re-create the local metadata and retry until drbd-overview
            # reports the peers as Connected; while the resource sits in
            # WFConnection (waiting for the peer) just poll.
            while not "Connected" in output[0]:
                time.sleep(2)
                log.debug(vcutil.execute("drbdadm down r0"))
                log.debug(vcutil.execute("drbdadm -- --force wipe-md r0"))
                log.debug(vcutil.execute("/etc/init.d/drbd restart"))
                log.debug(vcutil.execute("drbdadm -- --force create-md r0"))
                log.debug(vcutil.execute("drbdadm up r0"))
                output = vcutil.execute("drbd-overview")
                log.debug(output)
                while "WFConnection" in output[0]:
                    time.sleep(2)
                    output = vcutil.execute("drbd-overview")
                    log.debug(output)

            log.info("Master - Syncing volumes...")
            log.debug(vcutil.execute("drbd-overview"))
            # Skip the initial full sync (both sides start zeroed/fresh) and
            # promote this node to primary.
            log.debug(vcutil.execute("drbdadm -- --clear-bitmap new-current-uuid r0"))
            log.debug(vcutil.execute("drbd-overview"))
            log.debug(vcutil.execute("drbdadm primary r0"))
            log.debug(vcutil.execute("drbd-overview"))
            #log.debug(vcutil.execute("drbdadm -- --overwrite-data-of-peer primary all"))
            #output = vcutil.execute("drbd-overview")
            #log.debug(output)
            #while not "UpToDate/UpToDate" in output[0]:
            #    time.sleep(10)
            #    output = vcutil.execute("drbd-overview")
            #    log.debug(output)
            log.info("Shadow - Setting both as primary...")
            # Dual-primary: the shadow is promoted too (allow-two-primaries
            # is set in the resource config above).
            log.debug(ssh.exec_command("drbdadm primary all")[1].read())

            #Install OCFS & NFS
            log.info("Master - Configuring OCFS")
            with open("/etc/ocfs2/cluster.conf",'w') as file:
                file.write(ocfs_conf)
            # Format the replicated device with OCFS2 (2 node slots) and
            # mount it on /volume.
            log.debug(vcutil.execute("""/etc/init.d/ocfs2 restart
/etc/init.d/o2cb restart
yes | mkfs -t ocfs2 -N 2 -F -L ocfs2_drbd1 /dev/drbd1
mkdir -p /volume
mount /dev/drbd1 /volume"""))
        else:
            # Single-node cluster: no replication, plain ext3 on the raw
            # device, bind-mounted where the shared tree is expected.
            log.info("Master - Formatting volume")
            log.debug(vcutil.execute("""mkfs -t ext3 /dev/xvde
mkdir -p /volume
mount /dev/xvde /volume
mkdir /shared
mount --bind /volume /shared"""))

        # Export /volume over NFS to the experiment's private network so
        # compute nodes can mount the shared tree.
        log.info("Master - Configuring NFS")
        log.debug(vcutil.execute("""echo '/volume 172.18.0.0/16(rw,sync,no_root_squash,no_subtree_check)' >> /etc/exports
/etc/init.d/nfs-kernel-server restart"""))

        if shadow_host:
            # Each node mounts the OTHER node's NFS export on /shared, so a
            # node failure leaves the survivor's export usable.
            log.info("Master - Re-mounting volumes...")
            log.debug(vcutil.execute("""mkdir -p /shared
mount -t nfs -o nordirplus,hard,nointr,rw %s:/volume /shared""" % master_host[1]))
            log.info("Shadow - Configuring OCFS")
            file = sftp.open("/etc/ocfs2/cluster.conf",'w')
            file.write(ocfs_conf)
            file.close()
            log.debug(ssh.exec_command("""/etc/init.d/ocfs2 restart
/etc/init.d/o2cb restart
mkdir -p /volume
mount /dev/drbd1 /volume""")[1].read())
            log.info("Shadow - Configuring NFS")
            log.debug(ssh.exec_command("""echo '/volume 172.18.0.0/16(rw,sync,no_root_squash,no_subtree_check)' >> /etc/exports
/etc/init.d/nfs-kernel-server restart""")[1].read())
            log.info("Shadow - Re-mounting volumes...")
            log.debug(ssh.exec_command("""mkdir -p /shared
mount -t nfs -o nordirplus,hard,nointr,rw %s:/volume /shared
df -h""" % shadow_host[1])[1].read())

        #Install OGS & VC scripts
        # Unpack the OGS/SGE distribution into the shared volume and run the
        # unattended installer (-m: qmaster) with the config written above.
        log.info("Master - Installing OGS...")
        log.debug(vcutil.execute("""mkdir -p /shared/ogs
mkdir -p /shared/home
df -h"""))
        with open("/shared/ogs/ogs.conf",'w') as file:
            file.write(ogs_conf)
        log.debug(vcutil.execute("""export SGE_ROOT=/shared/ogs
tar xzf /root/sge_root.tar.gz -C $SGE_ROOT/..
chown -R root:root $SGE_ROOT
cd $SGE_ROOT
./inst_sge -m -auto $SGE_ROOT/ogs.conf
echo \"source $SGE_ROOT/default/common/settings.sh\" >> /root/.bashrc
echo \"export SGE_CHECK_INTERVAL=45\" >> /root/.bashrc
echo \"export SGE_GET_ACTIVE_INTERVAL=90\" >> /root/.bashrc
echo \"export SGE_DELAY_TIME=120\" >> /root/.bashrc
cp $SGE_ROOT/default/common/sgemaster /etc/init.d"""))
        log.info("Master - Initializing Virtual Cluster script...")
        # Launch the host-updater daemon in a forked child; its argument is
        # its own log file path.
        # vcutil.execute(". /shared/ogs/default/common/settings.sh; python -u /root/vc/vc-main-host-updater.py &> /var/log/vc-main-host-updater.log", fork = True)
        vcutil.execute(". /root/.bashrc; python -u /root/vc/vc-main-host-updater.py /var/log/vc-main-host-updater.log", fork = True)

        if shadow_host:
            # Same installation on the shadow, but as shadow master (-sm),
            # reusing the shared $SGE_ROOT already populated by the master.
            log.info("Shadow - Installing OGS...")
            log.debug(ssh.exec_command("""export SGE_ROOT=/shared/ogs
cd $SGE_ROOT
./inst_sge -sm -auto $SGE_ROOT/ogs.conf
echo \". $SGE_ROOT/default/common/settings.sh\" >> /root/.bashrc
echo \"export SGE_CHECK_INTERVAL=45\" >> /root/.bashrc
echo \"export SGE_GET_ACTIVE_INTERVAL=90\" >> /root/.bashrc
echo \"export SGE_DELAY_TIME=120\" >> /root/.bashrc
cp $SGE_ROOT/default/common/sgemaster /etc/init.d""")[1].read())
            log.info("Shadow - Initializing Virtual Cluster script...")
            # nohup + & so the updater survives the SSH session closing.
            log.debug(ssh.exec_command("nohup < /dev/null python -u /root/vc/vc-main-host-updater.py /var/log/vc-main-host-updater.log &")[1].read())
    else:
        log.info("I am shadow node, additional startup actions for me will be performed by master node")
except Exception as excpt:
    # Last-resort handler: record any failure of the unattended bootstrap.
    log.exception(excpt)