From f20cf2936fed4cdcb83cd87b80811acadcf5050d Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Sun, 25 Jan 2015 22:59:54 +0200 Subject: [PATCH 01/27] adding tools to rootfs --- Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 565db55..7fbfc85 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ endif all: eggs $(ROOTFS) check_convention check_convention: - pep8 py --max-line-length=109 + pep8 py --max-line-length=150 submit: sudo -E solvent submitproduct rootfs $(ROOTFS) @@ -43,6 +43,9 @@ $(ROOTFS): build/smartctl sudo cp ../inaugurator/dist/inaugurator-1.0-py2.7.egg $(ROOTFS).tmp/tmp sudo chroot $(ROOTFS).tmp easy_install /tmp/inaugurator-1.0-py2.7.egg sudo chroot $(ROOTFS).tmp yum install --assumeyes $(YUMCACHE)/mirrors.kernel.org/fedora-epel/7/x86_64/m/msr-tools-1.3-1.el7.x86_64.rpm + sudo chroot $(ROOTFS).tmp yum install --assumeyes $(YUMCACHE)/mirrors.kernel.org/fedora-epel/7/x86_64/v/vconfig-1.9-16.el7.x86_64.rpm + sudo chroot $(ROOTFS).tmp pip install rpdb + sudo chroot $(ROOTFS).tmp pip install ipaddr sudo cp $< $(ROOTFS).tmp/usr/sbin/ sudo rm -fr $(ROOTFS).tmp/tmp/* sudo mv $(ROOTFS).tmp $(ROOTFS) @@ -69,5 +72,7 @@ RPMS_TO_INSTALL = \ strace \ zip \ tcpdump \ - unzip + unzip \ + pciutils \ + nmap-ncat From f93cb848f6998ce8f140452a5253e1e4f765c9e1 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:24:32 +0200 Subject: [PATCH 02/27] making innaugurator a seed. This way it is easier to debug, and looks more like all other stuff we do in strato. we will use our own infrastructure for the tools. --- Makefile | 13 +- py/rackattack/dryrun/master/__init__.py | 0 py/rackattack/dryrun/master/main.py | 88 ------------ py/rackattack/dryrun/master/network.py | 26 ---- py/rackattack/dryrun/seeds/innaugurator.py | 153 +++++++++++++++++++++ py/rackattack/dryrun/seeds/network.py | 56 ++++++++ 6 files changed, 213 insertions(+), 123 deletions(-) delete mode 100644 py/rackattack/dryrun/master/__init__.py delete mode 100644 py/rackattack/dryrun/master/main.py delete mode 100644 py/rackattack/dryrun/master/network.py create mode 100644 py/rackattack/dryrun/seeds/innaugurator.py create mode 100644 py/rackattack/dryrun/seeds/network.py diff --git a/Makefile b/Makefile index 7fbfc85..56818aa 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ else Q = @ endif -all: eggs $(ROOTFS) check_convention +all: $(ROOTFS) check_convention check_convention: pep8 py --max-line-length=150 @@ -22,14 +22,9 @@ approve: sudo -E solvent approve --product=rootfs clean: - sudo rm -fr build - -eggs: build/master.egg - -build/master.egg: - $(Q)mkdir -p build - PYTHONPATH=py UPSETO_JOIN_PYTHON_NAMESPACES=yes python -m upseto.packegg --entryPoint=py/rackattack/dryrun/master/main.py --output=$@ --createDeps=$@.deps --takeSitePackages --joinPythonNamespaces --include build/master.egg.deps + @sudo rm -fr build + @find -name "*.pyc" -delete + $(ROOTFS): build/smartctl -sudo mv $(ROOTFS) $(ROOTFS).tmp diff --git a/py/rackattack/dryrun/master/__init__.py b/py/rackattack/dryrun/master/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/py/rackattack/dryrun/master/main.py b/py/rackattack/dryrun/master/main.py deleted file mode 100644 index d1f709a..0000000 --- a/py/rackattack/dryrun/master/main.py +++ /dev/null @@ -1,88 +0,0 @@ -import logging -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') -import argparse -import threading -from rackattack.common import tftpboot -from rackattack.common import dnsmasq -from rackattack.common import inaugurate -from rackattack.physical import ipmi -from rackattack.physical import serialoverlan -from rackattack.dryrun.master import network -from rackattack.common import globallock -import time - -parser = argparse.ArgumentParser() -parser.add_argument("--hostID", required=True) -parser.add_argument("--macAddress", required=True) -parser.add_argument("--ipmiHost", required=True) -parser.add_argument("--ipmiUsername", required=True) -parser.add_argument("--ipmiPassword", required=True) -parser.add_argument("--osmosisServerIP", required=True) -parser.add_argument("--ipAddress", required=True) -parser.add_argument("--label", required=True) -args = parser.parse_args() - - -checkInEvent = threading.Event() -doneEvent = threading.Event() - - -def inaugurateCheckIn(): - logging.info("Inaugurator checked in") - inaugurateInstance.provideLabel(ipAddress=args.ipAddress, label=args.label) - checkInEvent.set() - - -def inaugurateDone(): - logging.info("Inaugurator done") - doneEvent.set() - - -network.dropFirewall() -logging.info("MyIP: %(ip)s", dict(ip=network.myIP())) - -tftpbootInstance = tftpboot.TFTPBoot( - netmask=network.netmask(), - inauguratorServerIP=network.myIP(), - osmosisServerIP=args.osmosisServerIP, - inauguratorGatewayIP=network.myIP(), - rootPassword="dryrun", - withLocalObjectStore=True) -dnsmasq.DNSMasq.eraseLeasesFile() -dnsmasq.DNSMasq.killAllPrevious() -dnsmasqInstance = dnsmasq.DNSMasq( - tftpboot=tftpbootInstance, - serverIP=network.myIP(), - netmask=network.netmask(), - firstIP=args.ipAddress, - lastIP=args.ipAddress, - gateway=network.gateway(), - nameserver=network.myIP()) -logging.info("Sleeping 1 second to let dnsmasq go up, so it can receive SIGHUP") -time.sleep(1) -logging.info("Done Sleeping 1 second to let dnsmasq go up, so it can receive SIGHUP") -inaugurateInstance = inaugurate.Inaugurate(bindHostname=network.myIP()) -with globallock.lock(): - dnsmasqInstance.add(args.macAddress, args.ipAddress) - inaugurateInstance.register( - ipAddress=args.ipAddress, - checkInCallback=inaugurateCheckIn, - doneCallback=inaugurateDone) - tftpbootInstance.configureForInaugurator(args.macAddress, args.ipAddress, clearDisk=True) -sol = serialoverlan.SerialOverLan(args.ipmiHost, args.ipmiUsername, args.ipmiPassword, args.hostID) -ipmiInstance = ipmi.IPMI(args.ipmiHost, args.ipmiUsername, args.ipmiPassword) -ipmiInstance.powerCycle() -try: - logging.info("Waiting for inaugurator to check in") - checkInEvent.wait(6 * 60) - if not checkInEvent.isSet(): - raise Exception("Timeout waiting for inaugurator to checkin") - logging.info("Inaugurator checked in, waiting for inaugurator to complete") - doneEvent.wait(7 * 60) - if not doneEvent.isSet(): - raise Exception("timeout waiting for inaugurator to be done") -except: - logging.info("Serial log was:\n%(log)s", dict(log=open(sol.serialLogFilename()).read())) - raise -finally: - ipmiInstance.off() diff --git a/py/rackattack/dryrun/master/network.py b/py/rackattack/dryrun/master/network.py deleted file mode 100644 index 56adf28..0000000 --- a/py/rackattack/dryrun/master/network.py +++ /dev/null @@ -1,26 +0,0 @@ -import re -import socket -import subprocess - - -def myIP(): - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - try: - s.connect(("1.1.1.1", 1000)) - return s.getsockname()[0] - finally: - s.close() - - -def netmask(): - output = subprocess.check_output(['ifconfig']) - return re.search(r"inet\s+%s\s+netmask\s+(\S+)\s" % myIP(), output).group(1) - - -def gateway(): - output = subprocess.check_output(['ip', 'route', 'show']) - return re.search(r"default\s+via\s+(\S+)\s", output).group(1) - - -def dropFirewall(): - subprocess.check_output(["iptables", "--flush"]) diff --git a/py/rackattack/dryrun/seeds/innaugurator.py b/py/rackattack/dryrun/seeds/innaugurator.py new file mode 100644 index 0000000..c9cb404 --- /dev/null +++ b/py/rackattack/dryrun/seeds/innaugurator.py @@ -0,0 +1,153 @@ +import logging +import socket +import functools +logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +import argparse +import threading +from rackattack.common import tftpboot +from rackattack.common import dnsmasq +from rackattack.common import inaugurate +from rackattack.physical import ipmi +from rackattack.physical import serialoverlan +from rackattack.common import globallock +import time +import network + + +class Waiter: + + def __init__(self, nodes): + self.nodes = nodes + self.condition = threading.Condition() + + def notifyOne(self, checkedInNode): + self.condition.acquire() + self.nodes = [node for node in self.nodes if node is not checkedInNode] + if len(self.nodes) == 0: + self.condition.notifyAll() + self.condition.release() + + def waitAll(self, timeout=None): + self.condition.acquire() + self.condition.wait(timeout=timeout) + self.condition.release() + return self.nodes + + +def waitForTCPServer(hostname, port, timeout=60, interval=0.1): + before = time.time() + while time.time() - before < timeout: + if _rawTCPConnect((hostname, port)): + return + time.sleep(interval) + raise Exception("SSH TCP Server '%(hostname)s:%(port)s' did not respond within timeout" % dict(hostname=hostname, port=port)) + + +def _rawTCPConnect(tcpEndpoint): + s = socket.socket() + try: + s.connect(tcpEndpoint) + return True + except: + return False + finally: + s.close() + + +def inaugurateCheckIn(inaugurateInstance, innaguratedNode, rootfsLabel, notifier): + logging.info("Inaugurator checked in") + inaugurateInstance.provideLabel(ipAddress=innaguratedNode['ipAddress'], label=rootfsLabel) + notifier.notifyOne(innaguratedNode) + + +def inaugurateDone(innaguratedNode, notifier): + logging.info("Inaugurator checked in") + notifier.notifyOne(innaguratedNode) + + +def _prepareForInnauguration(dnsmasqInstance, inaugurateInstance, tftpbootInstance, + nodesToInnagurate, rootfsLabel, checkinWaiter, doneWaiter): + with globallock.lock(): + for nodeToInnaugurate in nodesToInnagurate: + dnsmasqInstance.add(nodeToInnaugurate['macAddress'], nodeToInnaugurate['ipAddress']) + checkInCallback = functools.partial(inaugurateCheckIn, + inaugurateInstance, + nodeToInnaugurate, + rootfsLabel, + checkinWaiter) + doneCallback = functools.partial(inaugurateDone, + innaguratedNode=nodeToInnaugurate, + notifier=doneWaiter) + inaugurateInstance.register(ipAddress=nodeToInnaugurate['ipAddress'], + checkInCallback=checkInCallback, + doneCallback=doneCallback) + tftpbootInstance.configureForInaugurator(nodeToInnaugurate['macAddress'], + nodeToInnaugurate['ipAddress'], + clearDisk=True) + + +def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): + network.dropFirewall() + logging.info("MyIP: %(ip)s", dict(ip=network.myIP())) + + tftpbootInstance = tftpboot.TFTPBoot( + netmask=network.netmask(), + inauguratorServerIP=network.myIP(), + osmosisServerIP=osmosisServerIP, + inauguratorGatewayIP=network.myIP(), + rootPassword="dryrun", + withLocalObjectStore=True) + dnsmasq.DNSMasq.eraseLeasesFile() + dnsmasq.DNSMasq.killAllPrevious() + dnsmasqInstance = dnsmasq.DNSMasq( + tftpboot=tftpbootInstance, + serverIP=network.myIP(), + netmask=network.netmask(), + firstIP=network.myIP(), + lastIP=network.myIP(), + gateway=network.gateway(), + nameserver=network.myIP()) + + logging.info("Sleeping 1 second to let dnsmasq go up, so it can receive SIGHUP") + time.sleep(1) + logging.info("Done Sleeping 1 second to let dnsmasq go up, so it can receive SIGHUP") + inaugurateInstance = inaugurate.Inaugurate(bindHostname=network.myIP()) + + checkinWaiters = Waiter(nodesToInnagurate) + doneWaiters = Waiter(nodesToInnagurate) + _prepareForInnauguration(dnsmasqInstance, inaugurateInstance, + tftpbootInstance, nodesToInnagurate, rootfsLabel, + checkinWaiters, doneWaiters) + solReaders = dict() + for nodeToInnaugurate in nodesToInnagurate: + sol = serialoverlan.SerialOverLan(nodeToInnaugurate['ipmiHost'], + nodeToInnaugurate['ipmiUsername'], + nodeToInnaugurate['ipmiPassword'], + nodeToInnaugurate['hostID']) + solReaders[nodeToInnaugurate['macAddress']] = sol + + for nodeToInnaugurate in nodesToInnagurate: + ipmiInstance = ipmi.IPMI(nodeToInnaugurate['ipmiHost'], + nodeToInnaugurate['ipmiUsername'], + nodeToInnaugurate['ipmiPassword']) + ipmiInstance.powerCycle() + + logging.info("Waiting for inaugurator to check in") + remainingNodes = checkinWaiters.waitAll(timeout=6 * 60) + logging.error("Failed to checkin nodes %(nodes)s", dict(nodes=remainingNodes)) + remainingNodes = doneWaiters.waitAll(timeout=7 * 60) + logging.error("Failed to finish nodes %(nodes)s", dict(nodes=remainingNodes)) + + nodesToWaitForIp = [node for node in nodesToInnagurate if node not in remainingNodes] + # Now wait for all servers to obtain an IP + for nodeToInnaugurate in nodesToWaitForIp: + try: + waitForTCPServer(nodeToInnaugurate['ipAddress'], 22) + except: + logging.exception("Failed to wait for active ssh connection on %(node)s", + dict(node=nodesToInnagurate['hostID'])) + remainingNodes.append(nodeToInnaugurate) + + failedNodes = {node['hostID']: open(solReaders[node['macAddress']].serialLogFilename()).read() + for node in remainingNodes} + return failedNodes diff --git a/py/rackattack/dryrun/seeds/network.py b/py/rackattack/dryrun/seeds/network.py new file mode 100644 index 0000000..c2cbc1b --- /dev/null +++ b/py/rackattack/dryrun/seeds/network.py @@ -0,0 +1,56 @@ +import netifaces +import re +import subprocess +import socket + + +def _exec(command): + return subprocess.check_output(command, shell=True, stdin=open('/dev/null'), close_fds=True) + + +def configureStaticIPOnDevice(ip4Network, deviceName): + _exec('ip addr add %(ipmask)s dev %(deviceName)s' % dict(ipmask=ip4Network.with_prefixlen, deviceName=deviceName)) + + +def interfaces(): + nicsBySpeed = dict(slow=[], fast=[]) + # This is copied fron postinstaller + nics = [nic for nic in netifaces.interfaces() if nic.startswith('e') or nic.startswith('p')] + + for nic in nics: + ethtoolOutput = _exec('ethtool %s' % nic).split('\n\t') + speedString = ''.join([ethtoolLine for ethtoolLine in ethtoolOutput if ethtoolLine.startswith('Speed')]) + if not speedString or speedString == 'Speed: Unknown!': + continue + else: + speed = int(re.findall(r'\d+', speedString)[0]) + speedKey = 'fast' if speed > 1000 else 'slow' + macAddress = netifaces.ifaddresses(nic)[netifaces.AF_LINK][0]['addr'] + nicsBySpeed[speedKey].append((nic, speed, macAddress)) + return nicsBySpeed + + +def myIP(): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(("1.1.1.1", 1000)) + return s.getsockname()[0] + finally: + s.close() + + +def netmask(): + output = subprocess.check_output(['ifconfig']) + return re.search(r"inet\s+%s\s+netmask\s+(\S+)\s" % myIP(), output).group(1) + + +def gateway(): + output = subprocess.check_output(['ip', 'route', 'show']) + return re.search(r"default\s+via\s+(\S+)\s", output).group(1) + + +def dropFirewall(): + subprocess.check_output(["iptables", "--flush"]) + subprocess.check_output(["iptables", '-t', 'nat', "--flush"]) + subprocess.check_output(["iptables", "--delete-chain"]) + subprocess.check_output(["iptables", '-t', 'nat', "--delete-chain"]) From 3f529dfb5b84be92f3b5de6a29c7664b0064e405 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:25:57 +0200 Subject: [PATCH 03/27] helper scripts to wait and return results --- py/rackattack/dryrun/common/__init__.py | 2 ++ .../dryrun/common/waitforpredicate.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 py/rackattack/dryrun/common/__init__.py create mode 100644 py/rackattack/dryrun/common/waitforpredicate.py diff --git a/py/rackattack/dryrun/common/__init__.py b/py/rackattack/dryrun/common/__init__.py new file mode 100644 index 0000000..48e952a --- /dev/null +++ b/py/rackattack/dryrun/common/__init__.py @@ -0,0 +1,2 @@ +import upseto.pythonnamespacejoin +__path__.extend(upseto.pythonnamespacejoin.join(globals())) diff --git a/py/rackattack/dryrun/common/waitforpredicate.py b/py/rackattack/dryrun/common/waitforpredicate.py new file mode 100644 index 0000000..47f3418 --- /dev/null +++ b/py/rackattack/dryrun/common/waitforpredicate.py @@ -0,0 +1,18 @@ +import time +import logging + + +class WaitForPredicate: + + def __init__(self, timeout=3, interval=0.1): + self._timeout = timeout + self._interval = interval + + def waitAndReturn(self, predicate, * args, ** kwargs): + before = time.time() + while time.time() - before < self._timeout: + ret = predicate(* args, ** kwargs) + if ret: + return ret + time.sleep(self._interval) + raise Exception("Predicate '%s' did not happen within timeout" % predicate) From 166cc6840d709a6b648e512c8c5babbc7e77721c Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:26:24 +0200 Subject: [PATCH 04/27] upsetoing to needed dependencies --- py/strato/__init__.py | 2 ++ py/strato/common/__init__.py | 2 ++ upseto.manifest | 8 ++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 py/strato/__init__.py create mode 100644 py/strato/common/__init__.py diff --git a/py/strato/__init__.py b/py/strato/__init__.py new file mode 100644 index 0000000..48e952a --- /dev/null +++ b/py/strato/__init__.py @@ -0,0 +1,2 @@ +import upseto.pythonnamespacejoin +__path__.extend(upseto.pythonnamespacejoin.join(globals())) diff --git a/py/strato/common/__init__.py b/py/strato/common/__init__.py new file mode 100644 index 0000000..48e952a --- /dev/null +++ b/py/strato/common/__init__.py @@ -0,0 +1,2 @@ +import upseto.pythonnamespacejoin +__path__.extend(upseto.pythonnamespacejoin.join(globals())) diff --git a/upseto.manifest b/upseto.manifest index 19a5e69..5ffc223 100644 --- a/upseto.manifest +++ b/upseto.manifest @@ -1,5 +1,9 @@ requirements: -- hash: e55359271d8f71f63f7c2cfa08f4ccf2ba0cbd99 +- hash: d5f7438c25437f8635d9485440f9db0516a3a10f + originURL: https://github.com/Stratoscale/pyracktest.git +- hash: ba5959a1a357fbe60db0e46cc4c5808b7ead97cf + originURL: https://github.com/Stratoscale/pycommonmultithreading.git +- hash: 6e98ac3fe49514e5c0b3255602721308f7dfe1f1 originURL: https://github.com/Stratoscale/rackattack-physical.git - hash: 389d66d1af1b7ee78000201b8b7c0defa66dbf6b - originURL: https://github.com/Stratoscale/inaugurator.git + originURL: https://github.com/Stratoscale/inaugurator.git \ No newline at end of file From 0cacb3e09dd9ef16c0162d9e1a09218d03f9d2f1 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:27:01 +0200 Subject: [PATCH 05/27] using host and not to access innaugurated node under test We will use the same techniques we use in testing to access innaugurated node as it will make checks transparent to whether the other node is also one of the tested nodes or the 'master' --- py/rackattack/dryrun/dryrunhost.py | 61 ++++++++++++++++++++++++++++++ py/rackattack/dryrun/node.py | 23 +++++++++++ 2 files changed, 84 insertions(+) create mode 100644 py/rackattack/dryrun/dryrunhost.py create mode 100644 py/rackattack/dryrun/node.py diff --git a/py/rackattack/dryrun/dryrunhost.py b/py/rackattack/dryrun/dryrunhost.py new file mode 100644 index 0000000..4755b43 --- /dev/null +++ b/py/rackattack/dryrun/dryrunhost.py @@ -0,0 +1,61 @@ +from rackattack.ssh import connection +from strato.racktest.hostundertest import plugins + +import strato.racktest.hostundertest.builtinplugins.rpm +import strato.racktest.hostundertest.builtinplugins.seed +from rackattack import ssh +import paramiko + +from rackattack.ssh import ftp +from rackattack.ssh import run +from rackattack.ssh import dirftp +from rackattack.ssh import tunnel + + +class DryRunHost(object): + + def __init__(self, node, credentials): + self.name = node.name() + self.ssh = ProxySSHConnection(node.masterHost, node.ipAddress(), credentials) + self.__plugins = {} + self.node = node + + def __getattr__(self, name): + if name not in self.__plugins: + self.__plugins[name] = plugins.plugins[name](self) + return self.__plugins[name] + + +class ProxySSHConnection(object): + + def __init__(self, masterHost, destIp, credentials): + self._masterHost = masterHost + self._destIp = destIp + self._credentials = credentials + self._sshClient = None + + @property + def run(self): + return run.Run(self._sshClient) + + @property + def ftp(self): + return ftp.FTP(self._sshClient) + + @property + def dirFTP(self): + return dirftp.DirFTP(self._sshClient) + + def close(self): + self._sshClient.close() + self._sshClient = None + + def connect(self): + transport = self._masterHost.ssh._sshClient.get_transport() + dst = (self._destIp, 22) + src = ('127.0.0.1', 0) + commChannel = transport.open_channel("direct-tcpip", dst, src) + self._sshClient = paramiko.client.SSHClient() + self._sshClient.known_hosts = None + self._sshClient.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self._sshClient.connect(src[0], port=src[1], sock=commChannel, **self._credentials) diff --git a/py/rackattack/dryrun/node.py b/py/rackattack/dryrun/node.py new file mode 100644 index 0000000..e50d7c4 --- /dev/null +++ b/py/rackattack/dryrun/node.py @@ -0,0 +1,23 @@ +from rackattack import api + + +class Node(api.Node): + + def __init__(self, name, masterHost, macAddress, ipAddress, nodeId): + self._ipAddress = ipAddress + self.masterHost = masterHost + self._name = name + self.nodeId = nodeId + self._primaryMacAddress = macAddress + + def ipAddress(self): + return self._ipAddress + + def name(self): + return self._name + + def id(self): + return self.nodeId + + def primaryMACAddress(self): + return self._primaryMacAddress From c809dc466619e2a91c263e0751d1f25e3cc2c612 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:28:49 +0200 Subject: [PATCH 06/27] added tools need to testing we will test disk, virtualization enabling and network for now --- py/rackattack/dryrun/lib/__init__.py | 0 py/rackattack/dryrun/lib/cpuinfo.py | 19 +++++ py/rackattack/dryrun/plugins/__init__.py | 0 py/rackattack/dryrun/plugins/disk.py | 19 +++++ py/rackattack/dryrun/plugins/kernel.py | 44 +++++++++++ py/rackattack/dryrun/plugins/network.py | 93 ++++++++++++++++++++++++ py/rackattack/dryrun/seeds/__init__.py | 0 py/rackattack/dryrun/seeds/cpuinfo.py | 5 ++ 8 files changed, 180 insertions(+) create mode 100644 py/rackattack/dryrun/lib/__init__.py create mode 100644 py/rackattack/dryrun/lib/cpuinfo.py create mode 100644 py/rackattack/dryrun/plugins/__init__.py create mode 100644 py/rackattack/dryrun/plugins/disk.py create mode 100644 py/rackattack/dryrun/plugins/kernel.py create mode 100644 py/rackattack/dryrun/plugins/network.py create mode 100644 py/rackattack/dryrun/seeds/__init__.py create mode 100644 py/rackattack/dryrun/seeds/cpuinfo.py diff --git a/py/rackattack/dryrun/lib/__init__.py b/py/rackattack/dryrun/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/py/rackattack/dryrun/lib/cpuinfo.py b/py/rackattack/dryrun/lib/cpuinfo.py new file mode 100644 index 0000000..8607812 --- /dev/null +++ b/py/rackattack/dryrun/lib/cpuinfo.py @@ -0,0 +1,19 @@ +class CpuInfo(dict): + + def __init__(self, cpuinfoString): + processor = {} + for cpuinfoLine in cpuinfoString.split('\n'): + if len(cpuinfoLine.strip()) == 0: + if len(processor.keys()) == 0: + continue + self[int(processor['processor'])] = processor + processor = {} + continue + (k, v) = cpuinfoLine.split(':') + processor[k.strip()] = v.strip() + + def hasFlag(self, processorNum, flagName): + return flagName in self[processorNum]['flags'] + + def hasVt(self, processorNum=0): + return self.hasFlag(processorNum, 'vmx') or self.hasFlag(processorNum, 'svm') diff --git a/py/rackattack/dryrun/plugins/__init__.py b/py/rackattack/dryrun/plugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/py/rackattack/dryrun/plugins/disk.py b/py/rackattack/dryrun/plugins/disk.py new file mode 100644 index 0000000..4246589 --- /dev/null +++ b/py/rackattack/dryrun/plugins/disk.py @@ -0,0 +1,19 @@ +from strato.racktest.hostundertest import plugins +from rackattack.dryrun.seeds import cpuinfo +from rackattack.dryrun.lib import cpuinfo as libcpuinfo +import logging + + +class Disk: + + def __init__(self, host): + self._host = host + + def smartctlStatus(self, deviceToCheck): + output = self._host.ssh.run.script("smartctl -H %(device)s" % dict(device=deviceToCheck)).strip() + return ('PASSED' in output, output) + + def rotational(self, deviceName): + return 1 == int(self._host.ssh.run.script('cat /sys/block/%(device)s/queue/rotational' % dict(device=deviceName)).strip()) + +plugins.register('disk', Disk) diff --git a/py/rackattack/dryrun/plugins/kernel.py b/py/rackattack/dryrun/plugins/kernel.py new file mode 100644 index 0000000..5f06557 --- /dev/null +++ b/py/rackattack/dryrun/plugins/kernel.py @@ -0,0 +1,44 @@ +from strato.racktest.hostundertest import plugins +from rackattack.dryrun.seeds import cpuinfo +from rackattack.dryrun.lib import cpuinfo as libcpuinfo +from strato.common.multithreading import waittonotthrow + + +class Kernel: + + def __init__(self, host): + self._host = host + + def version(self): + return self._host.ssh.run.script("uname -r") + + def is_debug(self): + return "debug" in self.version() + + def cpuinfo(self): + return libcpuinfo.CpuInfo(self._host.seed.runCallable(cpuinfo.cpuInfo)[0]) + + def rdmsr(self, register): + return self._host.ssh.run.script("rdmsr %(regnum)s" % dict(regnum=register)).strip() + + def modprobe(self, module, parameters=""): + try: + self._host.ssh.run.script("modprobe %(module)s %(parameters)s" % dict(module=module, parameters=parameters)) + except: + self._logDmesgOnModuleLoadFailure(module) + raise + + def removeKernelModuleIfLoaded(self, module): + if self.isModuleLoaded(module): + self.removeKernelModule(module) + + def removeKernelModule(self, module): + TIME_WAIT_FOR_RMMOD_TO_SUCCEEDD = 10 + waittonotthrow.WaitToNotThrow(timeout=TIME_WAIT_FOR_RMMOD_TO_SUCCEEDD).wait(lambda: self._host.ssh.run.script("rmmod %s" % module)) + + def isModuleLoaded(self, module): + output = self._host.ssh.run.script("lsmod") + return module in output.split() + + +plugins.register('kernel', Kernel) diff --git a/py/rackattack/dryrun/plugins/network.py b/py/rackattack/dryrun/plugins/network.py new file mode 100644 index 0000000..777f754 --- /dev/null +++ b/py/rackattack/dryrun/plugins/network.py @@ -0,0 +1,93 @@ +from strato.racktest.hostundertest import plugins +from rackattack.dryrun.seeds import network as seednetwork +import logging +import ipaddr +import time +from rackattack.dryrun.common import waitforpredicate +from strato.common.multithreading import subprocesswrappers + + +NETWORK_OFFSET = 10 + +SYSCONFIG_NETWORK_CONF = """DEVICE=%(device)s +HWADDR=%(mac)s +BOOTPROTO=static +IPADDR=%(ip)s +NETMASK=%(mask)s""" + + +class Network(object): + + def __init__(self, host): + logging.info("Initializing fast network on host %(host)s", dict(host=host.name)) + self._host = host + self._initMellanoxDevice() + self.networks = dict() + self._configureFastNetwork() + + def addTaggedDevice(self, vport, inetAddr): + untagedDeviceName = self.networks['untaged']['device'] + self._host.ssh.run.script("vconfig add %(deviceName)s %(vlanID)s" % dict(deviceName=untagedDeviceName, vlanID=vport)) + deviceName = "%(device)s.%(port)d" % dict(device=untagedDeviceName, port=vport) + self._host.seed.runCallable(seednetwork.configureStaticIPOnDevice, inetAddr, deviceName) + self.networks[vport] = dict(device=deviceName, ip=inetAddr.ip) + + def addTaggedDevices(self, vports): + for i, vport in enumerate(vports): + self.addTaggedDevice(vport, self._fastNetworkIpAddressFromMgmtIpAddress(i + 1)) + + def _fastNetworkIpAddressFromMgmtIpAddress(self, offset=0): + publicIpList = self._host.node.ipAddress().split('.') + publicIpList[2] = str(int(publicIpList[2]) + NETWORK_OFFSET + offset) + newIP = '.'.join(publicIpList) + netAddress = ipaddr.IPv4Network('%s/%d' % (newIP, 24)) + return netAddress + + def _configureFastNetwork(self): + try: + privateInterface = waitforpredicate.WaitForPredicate(timeout=40, interval=3).waitAndReturn(self._fastInterface) + except: + interfaces = self._host.seed.runCallable(seednetwork.interfaces)[0] + logging.exception("Failed to aquire fast interface on host %(host)s existing %(interfaces)s", + dict(host=self._host.name, interfaces=interfaces)) + raise + + inet = self._fastNetworkIpAddressFromMgmtIpAddress() + device = privateInterface[0] + mac = privateInterface[2] + logging.info("Adding ip address %(ip)s in host %(host)s device %(device)s mac %(mac)s" + % dict(ip=inet.ip, host=self._host.name, device=device, mac=mac)) + self.networks['untaged'] = dict(device=device, ip=inet.ip) + staticConfPath = '/etc/sysconfig/network-scripts/ifcfg-%(deviceName)s' % dict(deviceName=device) + self._host.ssh.ftp.putContents(staticConfPath, SYSCONFIG_NETWORK_CONF % + dict(device=device, ip=inet.ip, mac=mac, mask=inet.netmask)) + self._host.seed.runCallable(seednetwork.configureStaticIPOnDevice, inet, device) + + def _mellanoxPCICardID(self): + lspciLines = self._host.ssh.run.script("lspci").split('\n') + for line in lspciLines: + if 'Mellanox' in line: + if any(x in line for x in ['Network controller', 'Ethernet controller']): + return line.split(' ')[0] + return None + + def _initMellanoxDevice(self): + self._host.kernel.removeKernelModuleIfLoaded('mlx4_en') + self._host.kernel.removeKernelModuleIfLoaded('mlx4_core') + self._host.kernel.modprobe('mlx4_core', 'port_type_array=2,2') + self._host.kernel.modprobe('mlx4_en') + deviceName = waitforpredicate.WaitForPredicate(timeout=30, interval=3).waitAndReturn(self._mellanoxPCICardID) + self._host.kernel.modprobe('8021q') + self._host.ssh.run.script("/bin/echo eth > /sys/bus/pci/devices/0000:%(deviceName)s/mlx4_port1" + % dict(deviceName=deviceName)) + # Second port is not a must and in fact does not exists in bezeq cloud + self._host.ssh.run.script("/bin/echo eth > /sys/bus/pci/devices/0000:%(deviceName)s/mlx4_port2 || true" + % dict(deviceName=deviceName)) + + def _fastInterface(self): + interfaces = self._host.seed.runCallable(seednetwork.interfaces)[0] + if len(interfaces['fast']) > 0: + return interfaces['fast'][0] + return None + +plugins.register('network', Network) diff --git a/py/rackattack/dryrun/seeds/__init__.py b/py/rackattack/dryrun/seeds/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/py/rackattack/dryrun/seeds/cpuinfo.py b/py/rackattack/dryrun/seeds/cpuinfo.py new file mode 100644 index 0000000..b32a040 --- /dev/null +++ b/py/rackattack/dryrun/seeds/cpuinfo.py @@ -0,0 +1,5 @@ +import re + + +def cpuInfo(): + return open("/proc/cpuinfo").read() From 3e05b11316ef8ce232bb2c112d4edf8c2567ecb8 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:31:12 +0200 Subject: [PATCH 07/27] adding eclipse projects to ignored --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 7b55d73..e8702d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.pyc *.swp build +.project +.pydevproject From 539eb9b749f3521c76246530e33507ffe5521392 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:31:42 +0200 Subject: [PATCH 08/27] add object to hold test results for every node for every test this object will aggregate test results and the log --- py/rackattack/dryrun/servertestresult.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 py/rackattack/dryrun/servertestresult.py diff --git a/py/rackattack/dryrun/servertestresult.py b/py/rackattack/dryrun/servertestresult.py new file mode 100644 index 0000000..c4bf22c --- /dev/null +++ b/py/rackattack/dryrun/servertestresult.py @@ -0,0 +1,27 @@ +class ServerTestResult(dict): + + def __init__(self, serverId): + self['name'] = serverId + self['status'] = 'SUCCESS' + + def addCheck(self, checkCategory, checkName, checkStatus, checkLog='', extra=None): + categoryEntry = dict.setdefault(self, checkCategory, []) + categoryEntry.append((checkName, checkStatus, checkLog, extra)) + if not checkStatus: + self['status'] = 'FAIL' + + def failedChecks(self): + failedTests = [] + for category, categoryChecks in dict.items(self): + failedTests.extend([(category, check) for check in categoryChecks if check[1] is False]) + return failedTests + + def failedChecksByCategory(self, categoryName): + return [check for check in self[categoryName] if check[1] is False] + + def summary(self): + return {category: len(self.failedChecksByCategory(category)) == 0 + for category in self.keys() if category not in ['name', 'status']} + + def passed(self): + return self['status'] == 'SUCCESS' From 3ee966e57c13b662fa6ba8af9a7eb1aed78e28ca Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:32:25 +0200 Subject: [PATCH 09/27] main tester utility. this keeps all test logics --- py/rackattack/dryrun/healthchecher.py | 96 +++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 py/rackattack/dryrun/healthchecher.py diff --git a/py/rackattack/dryrun/healthchecher.py b/py/rackattack/dryrun/healthchecher.py new file mode 100644 index 0000000..827e7d9 --- /dev/null +++ b/py/rackattack/dryrun/healthchecher.py @@ -0,0 +1,96 @@ +import logging +from rackattack.dryrun.common import waitforpredicate +from strato.common.multithreading import concurrently +import pprint +import servertestresult +import threading + + +def _verifyVmxEnabledByBios(host, resultObject): + IA32_FEATURE_CONTROL = '0x3a' + VMXON_BIT = 2 + LOCK_BIT = 0 + VMX_ENABLED = ((1 << VMXON_BIT) | (1 << LOCK_BIT)) + regvalue = int(host.kernel.rdmsr(IA32_FEATURE_CONTROL)) + result = True + log = '' + if(regvalue & VMX_ENABLED) != VMX_ENABLED: + log = "VMX is not enabled in bios register val %(regvalue)" % dict(regvalue=hex(regvalue)) + result = False + resultObject.addCheck('virt', 'virtualization bios', result, log) + + +def _verifyVirtualizationEnabled(host, resultObject): + info = host.kernel.cpuinfo() + output = '' + result = True + if not info.hasVt(): + result = False + output = "Virtualization is not supported on %(hostname)s cpuninfo %(cpuinfo)s" % dict( + hostname=host.name, cpuinfo=pprint.pprint(info)) + resultObject.addCheck('virt', 'virtualization cpu support', result, output) + if info.hasFlag(0, 'vmx'): + _verifyVmxEnabledByBios(host, resultObject) + + +def _checkDisk(hostToCheck, resultObject): + result, output = hostToCheck.disk.smartctlStatus('/dev/sda') + resultObject.addCheck('disk', 'smartctl /dev/sda', result, output) + resultObject.addCheck('disk', 'SSD /dev/sda', not hostToCheck.disk.rotational('sda')) + + +def _pingScript(ip, deviceName): + return "ping -c 5 %(ip)s -I %(device)s" % dict(ip=ip, device=deviceName) + + +def _runPing(srcHost, dstHost, netName, testResult, lock): + ipDst = dstHost.network.networks[netName]['ip'] + srcDevice = srcHost.network.networks[netName]['device'] + logging.info("Pinging from host %(srchost)s to %(dstHost)s to ip %(ip)s from device %(srcDevice)s", dict( + srchost=srcHost.name, dstHost=dstHost.name, ip=ipDst, srcDevice=srcDevice)) + log = '' + result = True + pingScript = _pingScript(ipDst, srcDevice) + try: + srcHost.ssh.run.script(pingScript) + except: + result = False + log = "Failed pinging from host %(srchost)s to %(dstHost)s to ip %(ip)s" % dict( + srchost=srcHost.name, dstHost=dstHost.name, ip=ipDst) + logging.exception(log) + lock.acquire() + testResult.addCheck('net', 'ping on %(netName)s from %(src)s to %(dest)s "%(script)s"' % + dict(netName=netName, src=srcHost.name, dest=dstHost.name, script=pingScript), + result, log) + lock.release() + + +def _checkNetwork(node1, node2, vlanTags, testResult, lock): + for netName in ['untaged'] + vlanTags: + logging.info("Checking '%(network)s' network between %(node1)s and %(node2)s", + dict(network=str(netName), node1=node1.name, node2=node2.name)) + _runPing(node1, node2, netName, testResult, lock) + _runPing(node2, node1, netName, testResult, lock) + + +def checkServer(serverToCheck, serversToCheckNetwork, testResult, vlanTags): + _verifyVirtualizationEnabled(serverToCheck, testResult) + _checkDisk(serverToCheck, testResult) + lock = threading.Lock() + jobs = {server.name: (_checkNetwork, server, serverToCheck, vlanTags, testResult, lock) + for server in serversToCheckNetwork} + concurrently.run(jobs) + logging.info('Checking server %(server)s done result %(summary)s', + dict(server=serverToCheck.name, summary=testResult.summary())) + return testResult + + +def _partnerServer(masterHost, serversToCheck, serverToCheck): + return [masterHost] + [server for server in serversToCheck if server is not serverToCheck] + + +def checkServers(masterHost, hostsResultsMap, vlanTags): + serversToCheck = hostsResultsMap.keys() + jobs = {server.name: (checkServer, server, _partnerServer(masterHost, serversToCheck, server), testResult, vlanTags) + for server, testResult in hostsResultsMap.items()} + concurrently.run(jobs) From ba372fc4207f494e6bc5053793066be8b4390052 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 08:32:52 +0200 Subject: [PATCH 10/27] utility entry point responsible for machines bringuo before test and displaying the results --- py/rackattack/dryrun/main.py | 176 +++++++++++++++++++++++++++-------- 1 file changed, 139 insertions(+), 37 deletions(-) diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 8d5529c..0dc3671 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -1,57 +1,159 @@ import logging -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +import paramiko +from strato.common.log import configurelogging +import pprint +import sys +from rackattack.dryrun import servertestresult +import traceback +configurelogging.configureLogging('dryrun') import yaml import argparse -from rackattack import clientfactory from rackattack import api from rackattack.ssh import connection import subprocess import socket import time +import healthchecher +from rackattack import clientfactory +from rackattack.physical import ipmi +from plugins import kernel +from plugins import disk +from plugins import network +from strato.racktest.hostundertest import host +from rackattack.dryrun import dryrunhost +from rackattack.dryrun import node +from rackattack.dryrun.seeds import innaugurator +from strato.common.multithreading import concurrently + parser = argparse.ArgumentParser() parser.add_argument("--rackYaml", required=True) -parser.add_argument("--targetNode", required=True) parser.add_argument("--rackattackUser", required=True) -parser.add_argument("--ipAddress", required=True) parser.add_argument("--osmosisServerIP", required=True) +parser.add_argument("--ipAddress", required=True, action='append') +parser.add_argument("--targetNode", required=True, action='append') +parser.add_argument("--vlan", action='append', default=[], type=int) +parser.add_argument("--debug", default=False, type=bool) + args = parser.parse_args() + +def allocateMasterHost(rackuser, label): + client = clientfactory.factory() + logging.info("Allocating master node") + allocationInfo = api.AllocationInfo(user=rackuser, purpose="dryrun") + requirements = dict(master=api.Requirement(imageLabel=label, imageHint="rootfs-basic")) + allocation = client.allocate(requirements, allocationInfo) + allocation.wait(timeout=5 * 60) + logging.info("Allocation successful, waiting for ssh") + masterHost = host.Host(allocation.nodes()['master'], 'master') + masterHost.ssh.waitForTCPServer() + masterHost.ssh.connect() + return masterHost + + +def _allocateTestNodes(masterHost, hostsToInnagurate): + innaguratedHosts = [] + logging.info("Going to innagurate %(servers)d servers...be patient", dict(servers=len(hostsToInnagurate))) + failedNodes, log = masterHost.seed.runCallable(innaugurator.innaugurate, + osmosisServerIP=args.osmosisServerIP, + rootfsLabel=label, + nodesToInnagurate=hostsToInnagurate) + if len(failedNodes) > 0: + logging.error("Failed to innagurate %(nodes)d nodes log %(log)s", dict(nodes=len(failedNodes), log=log)) + for hostId, host in enumerate(hostsToInnagurate): + if host['hostID'] in failedNodes: + continue + allocatedNode = node.Node(host['hostID'], masterHost, host['macAddress'], host['ipAddress'], hostId) + hostToCheck = dryrunhost.DryRunHost(allocatedNode, dict(username='root', password='dryrun')) + hostToCheck.ssh.connect() + logging.info("Sucessfully connected to node %(node)s", dict(node=hostToCheck.name)) + innaguratedHosts.append(hostToCheck) + return (innaguratedHosts, failedNodes) + + +def printServerResults(results): + pp = pprint.PrettyPrinter(indent=4) + pp.pprint(results) + + +def printHostsThatFailedInnaguration(failedHosts): + for hostID, log in failedHosts.items(): + logging.error('Host %(host)s failed innauguration serial log %(log)s', dict(host=hostID, log=log)) + + +def _initializeFastNetworkOnHost(hostToInitialize, vtags, testResult): + try: + logging.info("Init Fast network in host %(host)s", dict(host=hostToInitialize.name)) + hostToInitialize.network.addTaggedDevices(vtags) + testResult.addCheck('net', 'init fast net', True) + return True + except: + testResult.addCheck('net', 'init fast net', False, traceback.format_exc()) + return False + + +def _initializeFastNetworkOnTestHosts(hostsMap, vtags): + jobs = {host: (_initializeFastNetworkOnHost, host, vtags, testResult) + for host, testResult in hostsMap.items()} + results = concurrently.run(jobs) + + initializedHosts = {resultHost: hostsMap[resultHost] for resultHost, result in results.items() if result} + return initializedHosts + with open(args.rackYaml) as f: rackYaml = yaml.load(f) -targetNode = [n for n in rackYaml['HOSTS'] if n['id'] == args.targetNode][0] -client = clientfactory.factory() -logging.info("Allocating master node") -allocationInfo = api.AllocationInfo(user=args.rackattackUser, purpose="dryrun") + +vtags = args.vlan label = subprocess.check_output(["solvent", "printlabel", "--thisProject", "--product=rootfs"]).strip() -requirements = dict(master=api.Requirement(imageLabel=label, imageHint="rootfs-basic")) -allocation = client.allocate(requirements, allocationInfo) -allocation.wait(timeout=5 * 60) -logging.info("Allocation successful, waiting for ssh") -masterNode = allocation.nodes()['master'] -ssh = connection.Connection(**masterNode.rootSSHCredentials()) -ssh.waitForTCPServer() -ssh.connect() -logging.info("Connected to ssh") -ssh.ftp.putFile("/tmp/master.egg", "build/master.egg") +masterHost = allocateMasterHost(args.rackattackUser, label) +masterHost.network.addTaggedDevices(vtags) + +targetNodes = [n for n in rackYaml['HOSTS'] if n['id'] in args.targetNode] +hostsToInnagurate = [] +assert len(targetNodes) == len(args.ipAddress), "Amount of target nodes must be the same as IP`s" +for targetNode, ipAddress in zip(targetNodes, args.ipAddress): + ipmiHost = socket.gethostbyname(targetNode['ipmiLogin']['hostname']) + ipmiUsername = targetNode['ipmiLogin']['username'] + ipmiPassword = targetNode['ipmiLogin']['password'] + macAddress = targetNode['primaryMAC'] + hostsToInnagurate.append(dict(hostID=targetNode['id'], + macAddress=macAddress, + ipAddress=ipAddress, + ipmiHost=ipmiHost, + ipmiUsername=ipmiUsername, + ipmiPassword=ipmiPassword)) +innaguratedHosts = [] +testResults = [] +exitCode = -1 try: - print ssh.run.script( - "PYTHONPATH=/tmp/master.egg " - "strace -fF -o /tmp/trace " - "python -m rackattack.dryrun.master.main " - "--hostID=%(targetNodeID)s --macAddress=%(macAddress)s " - "--ipmiHost=%(ipmiHost)s --ipmiUsername=%(ipmiUsername)s " - "--ipmiPassword=%(ipmiPassword)s --osmosisServerIP=%(osmosisServerIP)s " - "--ipAddress=%(ipAddress)s --label=%(label)s" % dict( - targetNodeID=targetNode['id'], - macAddress=targetNode['primaryMAC'], - ipmiHost=socket.gethostbyname(targetNode['ipmiLogin']['hostname']), - ipmiUsername=targetNode['ipmiLogin']['username'], - ipmiPassword=targetNode['ipmiLogin']['password'], - osmosisServerIP=args.osmosisServerIP, - ipAddress=args.ipAddress, - label=label)) + innaguratedHosts, failedHosts = _allocateTestNodes(masterHost, hostsToInnagurate) + for failedHost, log in failedHosts.items(): + testResult = servertestresult.ServerTestResult(failedHost) + testResult.addCheck('init', 'innaugarate', False, log) + testResults.append(testResult) + + logging.info('Going to test servers %(names)s', + dict(names=' '.join([innaguratedHost.name for innaguratedHost in innaguratedHosts]))) + + hostsResultsMap = {innaguratedHost: servertestresult.ServerTestResult(innaguratedHost.name) + for innaguratedHost in innaguratedHosts} + testResults.extend(hostsResultsMap.values()) + hostsToRunCheckOnMap = _initializeFastNetworkOnTestHosts(hostsResultsMap, vtags) + if len(hostsToRunCheckOnMap) > 0: + healthchecher.checkServers(masterHost, hostsToRunCheckOnMap, vtags) + exitCode = 0 if len([testResult for testResult in testResults if not testResult.passed()]) == 0 else -1 except: - import traceback - traceback.print_exc() - time.sleep(1000000) + printServerResults(testResults) + logging.exception("Failed running test script") +finally: + if args.debug: + import ipdb + ipdb.set_trace() + if len(innaguratedHosts) > 0: + logging.info("Powering hosts off") + jobs = {innaguratedHost.name: (ipmi.IPMI(ipmiHost, ipmiUsername, ipmiPassword)._powerCommand, 'off') + for innaguratedHost in innaguratedHosts} + concurrently.run(jobs) + logging.info('PASSED' if exitCode == 0 else 'FAILED') + sys.exit(exitCode) From 0ba2c416e66915852e88ef2bd6ceac4d59d188d6 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 12:04:22 +0200 Subject: [PATCH 11/27] increase max ssh connections for parallel testing --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 56818aa..f60c600 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,8 @@ $(ROOTFS): build/smartctl sudo chroot $(ROOTFS).tmp yum install --assumeyes $(YUMCACHE)/mirrors.kernel.org/fedora-epel/7/x86_64/v/vconfig-1.9-16.el7.x86_64.rpm sudo chroot $(ROOTFS).tmp pip install rpdb sudo chroot $(ROOTFS).tmp pip install ipaddr + sudo sh -c "echo 'MaxSessions 300' >> $(ROOTFS).tmp/etc/ssh/sshd_config" + sudo sh -c "echo 'UseDNS no' >> $(ROOTFS).tmp/etc/ssh/sshd_config" sudo cp $< $(ROOTFS).tmp/usr/sbin/ sudo rm -fr $(ROOTFS).tmp/tmp/* sudo mv $(ROOTFS).tmp $(ROOTFS) From d8195bedf107db452a7a65be5c05f7706f83db3f Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 12:16:49 +0200 Subject: [PATCH 12/27] dump dmesg when failing to load modules for some reason --- py/rackattack/dryrun/plugins/kernel.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/py/rackattack/dryrun/plugins/kernel.py b/py/rackattack/dryrun/plugins/kernel.py index 5f06557..d4e24a7 100644 --- a/py/rackattack/dryrun/plugins/kernel.py +++ b/py/rackattack/dryrun/plugins/kernel.py @@ -2,6 +2,7 @@ from rackattack.dryrun.seeds import cpuinfo from rackattack.dryrun.lib import cpuinfo as libcpuinfo from strato.common.multithreading import waittonotthrow +import logging class Kernel: @@ -40,5 +41,12 @@ def isModuleLoaded(self, module): output = self._host.ssh.run.script("lsmod") return module in output.split() + def _logDmesgOnModuleLoadFailure(self, moduleName): + # We assume that if we fail here it is because of some module dependencies, lets log last lines from dmesg + try: + logging.error('Failed to modprobe module %(module)s dmesg: %(dmesg)s', dict(module=moduleName, dmesg=self.dmesg(30))) + except: + pass # If fail on dmesg, ignore nothing we can do about it + plugins.register('kernel', Kernel) From 224601cd318a4a63592f5e6a037b052bf0429852 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 13:41:27 +0200 Subject: [PATCH 13/27] network initialize: split initialization from ctor and be more verbose be more verbose about network failures during initialization --- py/rackattack/dryrun/main.py | 25 +++++++++++++++++++++++-- py/rackattack/dryrun/plugins/network.py | 21 +++++++++++++++++---- py/rackattack/dryrun/seeds/network.py | 5 +++++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 0dc3671..7de2841 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -83,13 +83,33 @@ def printHostsThatFailedInnaguration(failedHosts): def _initializeFastNetworkOnHost(hostToInitialize, vtags, testResult): + logging.info("Init Fast network in host %(host)s", dict(host=hostToInitialize.name)) + try: + hostToInitialize.network.initialize() + except: + logging.exception("Failed to initialize network") + pciIdCard = hostToInitialize.network.mellanoxPCIId() + ethtoolResult = hostToInitialize.network.ethtool() + lspciOutput = hostToInitialize.ssh.run.script("lspci") + lsmodOutput = hostToInitialize.ssh.run.script("lsmod") + if pciIdCard is None: + testResult.addCheck('net', 'init fast net', False, "Mellanox Card is not identified lspci %(lspci)s lsmod %(lsmod)s" + % dict(lspci=lspciOutput, lsmod=lsmodOutput)) + elif hostToInitialize.network.fastInterface() is None: + testResult.addCheck('net', 'init fast net ', False, "Link is not connected on Mellanox %(ethtool)s" + % dict(ethtool=ethtoolResult)) + else: + testResult.addCheck('net', 'init fast net ', False, "Unknown problem lspci %(lspci)s lsmod %(lsmod)s %(ethtool)s" + % dict(lspci=lspciOutput, lsmod=lsmodOutput, ethtool=ethtoolResult)) + return False try: - logging.info("Init Fast network in host %(host)s", dict(host=hostToInitialize.name)) hostToInitialize.network.addTaggedDevices(vtags) testResult.addCheck('net', 'init fast net', True) return True except: - testResult.addCheck('net', 'init fast net', False, traceback.format_exc()) + logging.exception("Failed to Add vtags") + ifcfgOutput = hostToInitialize.network.ifconfig() + testResult.addCheck('net', 'init fast net', False, "Failed to add Vports ifcfg %(ifcfg)s" % dict(ifcfg=ifcfgOutput)) return False @@ -107,6 +127,7 @@ def _initializeFastNetworkOnTestHosts(hostsMap, vtags): vtags = args.vlan label = subprocess.check_output(["solvent", "printlabel", "--thisProject", "--product=rootfs"]).strip() masterHost = allocateMasterHost(args.rackattackUser, label) +masterHost.network.initialize() masterHost.network.addTaggedDevices(vtags) targetNodes = [n for n in rackYaml['HOSTS'] if n['id'] in args.targetNode] diff --git a/py/rackattack/dryrun/plugins/network.py b/py/rackattack/dryrun/plugins/network.py index 777f754..11051c4 100644 --- a/py/rackattack/dryrun/plugins/network.py +++ b/py/rackattack/dryrun/plugins/network.py @@ -19,10 +19,13 @@ class Network(object): def __init__(self, host): - logging.info("Initializing fast network on host %(host)s", dict(host=host.name)) self._host = host - self._initMellanoxDevice() + self._mellanixPCIId = None self.networks = dict() + + def initialize(self): + logging.info("Initializing fast network on host %(host)s", dict(host=self._host.name)) + self._initMellanoxDevice() self._configureFastNetwork() def addTaggedDevice(self, vport, inetAddr): @@ -45,7 +48,7 @@ def _fastNetworkIpAddressFromMgmtIpAddress(self, offset=0): def _configureFastNetwork(self): try: - privateInterface = waitforpredicate.WaitForPredicate(timeout=40, interval=3).waitAndReturn(self._fastInterface) + privateInterface = waitforpredicate.WaitForPredicate(timeout=40, interval=3).waitAndReturn(self.fastInterface) except: interfaces = self._host.seed.runCallable(seednetwork.interfaces)[0] logging.exception("Failed to aquire fast interface on host %(host)s existing %(interfaces)s", @@ -77,6 +80,7 @@ def _initMellanoxDevice(self): self._host.kernel.modprobe('mlx4_core', 'port_type_array=2,2') self._host.kernel.modprobe('mlx4_en') deviceName = waitforpredicate.WaitForPredicate(timeout=30, interval=3).waitAndReturn(self._mellanoxPCICardID) + self._mellanixPCIId = deviceName self._host.kernel.modprobe('8021q') self._host.ssh.run.script("/bin/echo eth > /sys/bus/pci/devices/0000:%(deviceName)s/mlx4_port1" % dict(deviceName=deviceName)) @@ -84,10 +88,19 @@ def _initMellanoxDevice(self): self._host.ssh.run.script("/bin/echo eth > /sys/bus/pci/devices/0000:%(deviceName)s/mlx4_port2 || true" % dict(deviceName=deviceName)) - def _fastInterface(self): + def mellanoxPCIId(self): + return self._mellanixPCIId + + def fastInterface(self): interfaces = self._host.seed.runCallable(seednetwork.interfaces)[0] if len(interfaces['fast']) > 0: return interfaces['fast'][0] return None + def ethtool(self): + return self._host.seed.runCallable(seednetwork.ethtool)[0] + + def ifconfig(self): + self._host.ssh.run.script("ifconfig -a -v") + plugins.register('network', Network) diff --git a/py/rackattack/dryrun/seeds/network.py b/py/rackattack/dryrun/seeds/network.py index c2cbc1b..92ca708 100644 --- a/py/rackattack/dryrun/seeds/network.py +++ b/py/rackattack/dryrun/seeds/network.py @@ -30,6 +30,11 @@ def interfaces(): return nicsBySpeed +def ethtool(): + return {nic: _exec('ethtool %s' % nic) for nic in netifaces.interfaces() + if nic.startswith('e') or nic.startswith('p')} + + def myIP(): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: From 1b1d832fc47964251d0c3ca10e384cdcbee57f18 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 13:58:59 +0200 Subject: [PATCH 14/27] force boot from pxe during innauguration --- py/rackattack/dryrun/seeds/innaugurator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py/rackattack/dryrun/seeds/innaugurator.py b/py/rackattack/dryrun/seeds/innaugurator.py index c9cb404..61f1f77 100644 --- a/py/rackattack/dryrun/seeds/innaugurator.py +++ b/py/rackattack/dryrun/seeds/innaugurator.py @@ -130,6 +130,7 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): ipmiInstance = ipmi.IPMI(nodeToInnaugurate['ipmiHost'], nodeToInnaugurate['ipmiUsername'], nodeToInnaugurate['ipmiPassword']) + ipmiInstance.forceBootFrom('pxe') ipmiInstance.powerCycle() logging.info("Waiting for inaugurator to check in") From 4113be3a7bcf5c31337b55ddac3d566d6af22f19 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 14:35:36 +0200 Subject: [PATCH 15/27] limit amount of threads --- py/rackattack/dryrun/healthchecher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/rackattack/dryrun/healthchecher.py b/py/rackattack/dryrun/healthchecher.py index 827e7d9..0fb55a8 100644 --- a/py/rackattack/dryrun/healthchecher.py +++ b/py/rackattack/dryrun/healthchecher.py @@ -79,7 +79,7 @@ def checkServer(serverToCheck, serversToCheckNetwork, testResult, vlanTags): lock = threading.Lock() jobs = {server.name: (_checkNetwork, server, serverToCheck, vlanTags, testResult, lock) for server in serversToCheckNetwork} - concurrently.run(jobs) + concurrently.run(jobs, numberOfThreads=10) logging.info('Checking server %(server)s done result %(summary)s', dict(server=serverToCheck.name, summary=testResult.summary())) return testResult @@ -93,4 +93,4 @@ def checkServers(masterHost, hostsResultsMap, vlanTags): serversToCheck = hostsResultsMap.keys() jobs = {server.name: (checkServer, server, _partnerServer(masterHost, serversToCheck, server), testResult, vlanTags) for server, testResult in hostsResultsMap.items()} - concurrently.run(jobs) + concurrently.run(jobs, numberOfThreads=10) From 4440488e5186e44e2966ce62a14142286da9bed6 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 14:45:57 +0200 Subject: [PATCH 16/27] pretty print ethtool results --- py/rackattack/dryrun/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 7de2841..6c76822 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -97,10 +97,10 @@ def _initializeFastNetworkOnHost(hostToInitialize, vtags, testResult): % dict(lspci=lspciOutput, lsmod=lsmodOutput)) elif hostToInitialize.network.fastInterface() is None: testResult.addCheck('net', 'init fast net ', False, "Link is not connected on Mellanox %(ethtool)s" - % dict(ethtool=ethtoolResult)) + % dict(ethtool=pprint.PrettyPrinter(indent=4).pformat(ethtoolResult))) else: testResult.addCheck('net', 'init fast net ', False, "Unknown problem lspci %(lspci)s lsmod %(lsmod)s %(ethtool)s" - % dict(lspci=lspciOutput, lsmod=lsmodOutput, ethtool=ethtoolResult)) + % dict(lspci=lspciOutput, lsmod=lsmodOutput, ethtool=pprint.PrettyPrinter(indent=4).pformat(ethtoolResult))) return False try: hostToInitialize.network.addTaggedDevices(vtags) From 4f1dedc5e9aceefaa9f6bf32872735cc4bebf959 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 15:52:14 +0200 Subject: [PATCH 17/27] more detailed printing and less logging --- py/rackattack/dryrun/healthchecher.py | 34 +++++++++++++-------------- py/rackattack/dryrun/main.py | 13 ++++++++-- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/py/rackattack/dryrun/healthchecher.py b/py/rackattack/dryrun/healthchecher.py index 0fb55a8..7137576 100644 --- a/py/rackattack/dryrun/healthchecher.py +++ b/py/rackattack/dryrun/healthchecher.py @@ -4,6 +4,7 @@ import pprint import servertestresult import threading +import sys def _verifyVmxEnabledByBios(host, resultObject): @@ -40,48 +41,45 @@ def _checkDisk(hostToCheck, resultObject): def _pingScript(ip, deviceName): - return "ping -c 5 %(ip)s -I %(device)s" % dict(ip=ip, device=deviceName) + return "ping -c 2 %(ip)s -I %(device)s" % dict(ip=ip, device=deviceName) def _runPing(srcHost, dstHost, netName, testResult, lock): ipDst = dstHost.network.networks[netName]['ip'] srcDevice = srcHost.network.networks[netName]['device'] - logging.info("Pinging from host %(srchost)s to %(dstHost)s to ip %(ip)s from device %(srcDevice)s", dict( - srchost=srcHost.name, dstHost=dstHost.name, ip=ipDst, srcDevice=srcDevice)) log = '' - result = True pingScript = _pingScript(ipDst, srcDevice) try: srcHost.ssh.run.script(pingScript) + lock.acquire() + testResult.addCheck('net', 'ping on %(netName)s from %(src)s to %(dest)s "%(script)s"' % + dict(netName=netName, src=srcHost.name, dest=dstHost.name, script=pingScript), + True, '', (netName, srcHost.name, dstHost.name)) + lock.release() except: - result = False log = "Failed pinging from host %(srchost)s to %(dstHost)s to ip %(ip)s" % dict( srchost=srcHost.name, dstHost=dstHost.name, ip=ipDst) - logging.exception(log) lock.acquire() - testResult.addCheck('net', 'ping on %(netName)s from %(src)s to %(dest)s "%(script)s"' % - dict(netName=netName, src=srcHost.name, dest=dstHost.name, script=pingScript), - result, log) + testResult.addCheck('net', 'ping on %(netName)s from %(src)s to %(dest)s "%(script)s" exception %(exception)s' % + dict(netName=netName, src=srcHost.name, dest=dstHost.name, script=pingScript, exception=sys.exc_info()[1].message), + False, log, (netName, srcHost.name, dstHost.name)) lock.release() def _checkNetwork(node1, node2, vlanTags, testResult, lock): for netName in ['untaged'] + vlanTags: - logging.info("Checking '%(network)s' network between %(node1)s and %(node2)s", - dict(network=str(netName), node1=node1.name, node2=node2.name)) _runPing(node1, node2, netName, testResult, lock) _runPing(node2, node1, netName, testResult, lock) def checkServer(serverToCheck, serversToCheckNetwork, testResult, vlanTags): + logging.info("Going to check %(server)s", dict(server=serverToCheck.name)) _verifyVirtualizationEnabled(serverToCheck, testResult) _checkDisk(serverToCheck, testResult) lock = threading.Lock() jobs = {server.name: (_checkNetwork, server, serverToCheck, vlanTags, testResult, lock) for server in serversToCheckNetwork} - concurrently.run(jobs, numberOfThreads=10) - logging.info('Checking server %(server)s done result %(summary)s', - dict(server=serverToCheck.name, summary=testResult.summary())) + concurrently.run(jobs, numberOfThreads=30) return testResult @@ -89,8 +87,8 @@ def _partnerServer(masterHost, serversToCheck, serverToCheck): return [masterHost] + [server for server in serversToCheck if server is not serverToCheck] -def checkServers(masterHost, hostsResultsMap, vlanTags): - serversToCheck = hostsResultsMap.keys() - jobs = {server.name: (checkServer, server, _partnerServer(masterHost, serversToCheck, server), testResult, vlanTags) - for server, testResult in hostsResultsMap.items()} +def checkServers(masterHost, hostsToProced, vlanTags): + allHosts = [host['host'] for host in hostsToProced] + jobs = {host['name']: (checkServer, host['host'], _partnerServer(masterHost, allHosts, host['host']), host['result'], vlanTags) + for host in hostsToProced} concurrently.run(jobs, numberOfThreads=10) diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 6c76822..6e7f7d2 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -73,8 +73,17 @@ def _allocateTestNodes(masterHost, hostsToInnagurate): def printServerResults(results): + passedServers = [result for result in results if result.passed()] + failedServers = [result for result in result if not result.passed()] + print "TOTALLY %d PASSED %d FAILED" % (len(passedServers), len(failedServers)) + + print "*********************FAILED SERVERS*******************************" pp = pprint.PrettyPrinter(indent=4) - pp.pprint(results) + for server in failedServers: + pp.pprint("%(name) - %(summary)s" % dict(name=server['name'], summary=server['summary'])) + print "*********************FAILED SERVERS DETAILS*******************************" + pp.pprint(failedServers) + print "*********************FAILED SERVERS DETAILS*******************************" def printHostsThatFailedInnaguration(failedHosts): @@ -165,9 +174,9 @@ def _initializeFastNetworkOnTestHosts(hostsMap, vtags): healthchecher.checkServers(masterHost, hostsToRunCheckOnMap, vtags) exitCode = 0 if len([testResult for testResult in testResults if not testResult.passed()]) == 0 else -1 except: - printServerResults(testResults) logging.exception("Failed running test script") finally: + printServerResults(testResults) if args.debug: import ipdb ipdb.set_trace() From fa8a9dab572a3c92473bfe603e22dfeffc01e57e Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 16:27:57 +0200 Subject: [PATCH 18/27] more verbose --- py/rackattack/dryrun/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 6e7f7d2..6d8d6b9 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -133,15 +133,16 @@ def _initializeFastNetworkOnTestHosts(hostsMap, vtags): with open(args.rackYaml) as f: rackYaml = yaml.load(f) +targetNodes = [n for n in rackYaml['HOSTS'] if n['id'] in args.targetNode] +assert len(targetNodes) == len(args.ipAddress), "Amount of target nodes must be the same as IP`s %d != %d" % (len(targetNodes), len(args.ipAddress)) + vtags = args.vlan label = subprocess.check_output(["solvent", "printlabel", "--thisProject", "--product=rootfs"]).strip() masterHost = allocateMasterHost(args.rackattackUser, label) masterHost.network.initialize() masterHost.network.addTaggedDevices(vtags) - -targetNodes = [n for n in rackYaml['HOSTS'] if n['id'] in args.targetNode] hostsToInnagurate = [] -assert len(targetNodes) == len(args.ipAddress), "Amount of target nodes must be the same as IP`s" + for targetNode, ipAddress in zip(targetNodes, args.ipAddress): ipmiHost = socket.gethostbyname(targetNode['ipmiLogin']['hostname']) ipmiUsername = targetNode['ipmiLogin']['username'] From 41508fa8daa662445bd139749151596307853aa9 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 17:00:15 +0200 Subject: [PATCH 19/27] use first node ip as range --- py/rackattack/dryrun/seeds/innaugurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/rackattack/dryrun/seeds/innaugurator.py b/py/rackattack/dryrun/seeds/innaugurator.py index 61f1f77..d84971b 100644 --- a/py/rackattack/dryrun/seeds/innaugurator.py +++ b/py/rackattack/dryrun/seeds/innaugurator.py @@ -103,8 +103,8 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): tftpboot=tftpbootInstance, serverIP=network.myIP(), netmask=network.netmask(), - firstIP=network.myIP(), - lastIP=network.myIP(), + firstIP=nodesToInnagurate[0]['ipAddress'], + lastIP=nodesToInnagurate[0]['ipAddress'], gateway=network.gateway(), nameserver=network.myIP()) From adeedaf81653b58df40d4eab7434bc417940c435 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 17:42:41 +0200 Subject: [PATCH 20/27] increase innaugurator wait since we test large amount of servers --- py/rackattack/dryrun/seeds/innaugurator.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/py/rackattack/dryrun/seeds/innaugurator.py b/py/rackattack/dryrun/seeds/innaugurator.py index d84971b..ef12371 100644 --- a/py/rackattack/dryrun/seeds/innaugurator.py +++ b/py/rackattack/dryrun/seeds/innaugurator.py @@ -134,12 +134,17 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): ipmiInstance.powerCycle() logging.info("Waiting for inaugurator to check in") - remainingNodes = checkinWaiters.waitAll(timeout=6 * 60) - logging.error("Failed to checkin nodes %(nodes)s", dict(nodes=remainingNodes)) - remainingNodes = doneWaiters.waitAll(timeout=7 * 60) - logging.error("Failed to finish nodes %(nodes)s", dict(nodes=remainingNodes)) - - nodesToWaitForIp = [node for node in nodesToInnagurate if node not in remainingNodes] + failedNodesList = [] + failedToCheckinNodes = checkinWaiters.waitAll(timeout=10 * 60) + logging.error("Failed to checkin nodes %(nodes)s", dict(nodes=failedToCheckinNodes)) + for nodeNotToWaitDone in failedToCheckinNodes: + doneWaiters.notifyOne(nodeNotToWaitDone) + notDoneNodes = doneWaiters.waitAll(timeout=10 * 60) + failedNodesList.extend(failedToCheckinNodes) + failedNodesList.extend(notDoneNodes) + logging.error("Failed to finish nodes %(nodes)s", dict(nodes=notDoneNodes)) + + nodesToWaitForIp = [node for node in nodesToInnagurate if node not in notDoneNodes] # Now wait for all servers to obtain an IP for nodeToInnaugurate in nodesToWaitForIp: try: @@ -147,8 +152,8 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): except: logging.exception("Failed to wait for active ssh connection on %(node)s", dict(node=nodesToInnagurate['hostID'])) - remainingNodes.append(nodeToInnaugurate) + failedNodesList.append(nodeToInnaugurate) failedNodes = {node['hostID']: open(solReaders[node['macAddress']].serialLogFilename()).read() - for node in remainingNodes} + for node in failedNodesList} return failedNodes From d44163b8ce7fa412d6c74b909cec16e1b1057f7c Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 27 Jan 2015 17:43:02 +0200 Subject: [PATCH 21/27] fix print errors --- py/rackattack/dryrun/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 6d8d6b9..df591bf 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -74,7 +74,7 @@ def _allocateTestNodes(masterHost, hostsToInnagurate): def printServerResults(results): passedServers = [result for result in results if result.passed()] - failedServers = [result for result in result if not result.passed()] + failedServers = [result for result in results if not result.passed()] print "TOTALLY %d PASSED %d FAILED" % (len(passedServers), len(failedServers)) print "*********************FAILED SERVERS*******************************" From 4ea232fce908a05224d43e85584a3ff1e15ad6f3 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Wed, 28 Jan 2015 14:49:53 +0200 Subject: [PATCH 22/27] deeper analysys of results store logs add netgraph analysys --- Makefile | 1 + py/rackattack/dryrun/main.py | 196 +++++++++++++++------ py/rackattack/dryrun/plugins/logplugin.py | 46 +++++ py/rackattack/dryrun/seeds/innaugurator.py | 71 +++++--- 4 files changed, 240 insertions(+), 74 deletions(-) create mode 100644 py/rackattack/dryrun/plugins/logplugin.py diff --git a/Makefile b/Makefile index f60c600..4c2a812 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ $(ROOTFS): build/smartctl sudo chroot $(ROOTFS).tmp easy_install /tmp/inaugurator-1.0-py2.7.egg sudo chroot $(ROOTFS).tmp yum install --assumeyes $(YUMCACHE)/mirrors.kernel.org/fedora-epel/7/x86_64/m/msr-tools-1.3-1.el7.x86_64.rpm sudo chroot $(ROOTFS).tmp yum install --assumeyes $(YUMCACHE)/mirrors.kernel.org/fedora-epel/7/x86_64/v/vconfig-1.9-16.el7.x86_64.rpm + sudo chroot $(ROOTFS).tmp yum install --assumeyes $(YUMCACHE)/mirrors.kernel.org/fedora-epel/7/x86_64/p/pigz-2.3.1-1.el7.x86_64.rpm sudo chroot $(ROOTFS).tmp pip install rpdb sudo chroot $(ROOTFS).tmp pip install ipaddr sudo sh -c "echo 'MaxSessions 300' >> $(ROOTFS).tmp/etc/ssh/sshd_config" diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index df591bf..366e664 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -5,7 +5,8 @@ import sys from rackattack.dryrun import servertestresult import traceback -configurelogging.configureLogging('dryrun') +import copy +configurelogging.configureLogging('dryrun', forceDirectory='logs') import yaml import argparse from rackattack import api @@ -19,6 +20,7 @@ from plugins import kernel from plugins import disk from plugins import network +from plugins import logplugin from strato.racktest.hostundertest import host from rackattack.dryrun import dryrunhost from rackattack.dryrun import node @@ -33,7 +35,9 @@ parser.add_argument("--ipAddress", required=True, action='append') parser.add_argument("--targetNode", required=True, action='append') parser.add_argument("--vlan", action='append', default=[], type=int) -parser.add_argument("--debug", default=False, type=bool) +parser.add_argument("--debug", action='store_true') +parser.add_argument("--noClearDisk", action='store_true') + args = parser.parse_args() @@ -55,44 +59,93 @@ def allocateMasterHost(rackuser, label): def _allocateTestNodes(masterHost, hostsToInnagurate): innaguratedHosts = [] logging.info("Going to innagurate %(servers)d servers...be patient", dict(servers=len(hostsToInnagurate))) + + hostDescriptors = [host['props'] for host in hostsToInnagurate] failedNodes, log = masterHost.seed.runCallable(innaugurator.innaugurate, osmosisServerIP=args.osmosisServerIP, rootfsLabel=label, - nodesToInnagurate=hostsToInnagurate) + nodesToInnagurate=hostDescriptors, + noClearDisk=args.noClearDisk, + outputTimeout=30 * 60) if len(failedNodes) > 0: logging.error("Failed to innagurate %(nodes)d nodes log %(log)s", dict(nodes=len(failedNodes), log=log)) - for hostId, host in enumerate(hostsToInnagurate): - if host['hostID'] in failedNodes: - continue - allocatedNode = node.Node(host['hostID'], masterHost, host['macAddress'], host['ipAddress'], hostId) - hostToCheck = dryrunhost.DryRunHost(allocatedNode, dict(username='root', password='dryrun')) - hostToCheck.ssh.connect() - logging.info("Sucessfully connected to node %(node)s", dict(node=hostToCheck.name)) - innaguratedHosts.append(hostToCheck) - return (innaguratedHosts, failedNodes) - - -def printServerResults(results): - passedServers = [result for result in results if result.passed()] - failedServers = [result for result in results if not result.passed()] + + for host in hostsToInnagurate: + if host['name'] in failedNodes: + host['result'].addCheck('init', 'innaugurate', False, failedNodes[host['name']]) + else: + host['result'].addCheck('init', 'innaugurate', True) + host['host'] = dryrunhost.DryRunHost(host['node'], dict(username='root', password='dryrun')) + host['host'].ssh.connect() + innaguratedHosts.append(host) + + return innaguratedHosts + + +def _allocateTestNodesInChunks(masterHost, hostsToInnagurate): + chunks = lambda l, n: [l[x: x + n] for x in xrange(0, len(l), n)] + hostsToInnagurateInChunks = chunks(hostsToInnagurate, 50) + totalInnauguratedHosts = [] + for hostsChunk in hostsToInnagurateInChunks: + totalInnauguratedHosts.extend(_allocateTestNodes(masterHost, hostsChunk)) + return totalInnauguratedHosts + + +def findNetworkCliques(hosts, networksToCheck): + import networkx + netGraph = networkx.Graph() + netGraph.add_nodes_from([host['host'].name for host in hosts]) + networkGraphs = {networkname: netGraph.copy() for networkname in networksToCheck} + + for host in hosts: + netChecks = host['result']['net'] + if netChecks is not None: + for netCheck in netChecks: + checkName = netCheck[0] + extra = netCheck[3] + if netCheck[1]: + if 'ping on' in checkName and extra is not None: + (netName, srcHost, dstHost) = extra + networkGraphs[netName].add_edge(srcHost, dstHost) + + networkCliques = {networkName: list(networkx.find_cliques(networkGraph)) for networkName, networkGraph in networkGraphs.items()} + return networkCliques + + +def printServerResults(hosts): + passedServers = [] + failedServers = [] + for host in hosts.values(): + (passedServers, failedServers)[0 if host['result'].passed() else 1].append(host['result']) + print "TOTALLY %d PASSED %d FAILED" % (len(passedServers), len(failedServers)) print "*********************FAILED SERVERS*******************************" pp = pprint.PrettyPrinter(indent=4) for server in failedServers: - pp.pprint("%(name) - %(summary)s" % dict(name=server['name'], summary=server['summary'])) + print("%(name)s - %(summary)s" % dict(name=server['name'], summary=str(server.summary()))) print "*********************FAILED SERVERS DETAILS*******************************" pp.pprint(failedServers) print "*********************FAILED SERVERS DETAILS*******************************" +def analyzeNetworks(hosts, vlans): + cliques = findNetworkCliques(hosts, vlans + ['untaged']) + print "*********************NETWORK CLIQUES*******************************" + pp = pprint.PrettyPrinter(indent=4) + for netName, networks in cliques.items(): + pp.pprint("%(name)s - %(networks)s" % dict(name=str(netName), networks=networks)) + + def printHostsThatFailedInnaguration(failedHosts): for hostID, log in failedHosts.items(): logging.error('Host %(host)s failed innauguration serial log %(log)s', dict(host=hostID, log=log)) -def _initializeFastNetworkOnHost(hostToInitialize, vtags, testResult): - logging.info("Init Fast network in host %(host)s", dict(host=hostToInitialize.name)) +def _initializeFastNetworkOnHost(host, vtags): + logging.info("Init Fast network in host %(host)s", dict(host=host['name'])) + hostToInitialize = host['host'] + testResult = host['result'] try: hostToInitialize.network.initialize() except: @@ -122,14 +175,61 @@ def _initializeFastNetworkOnHost(hostToInitialize, vtags, testResult): return False -def _initializeFastNetworkOnTestHosts(hostsMap, vtags): - jobs = {host: (_initializeFastNetworkOnHost, host, vtags, testResult) - for host, testResult in hostsMap.items()} +def _initializeFastNetworkOnTestHosts(hosts, vtags): + jobs = {host['name']: (_initializeFastNetworkOnHost, host, vtags) for host in hosts} results = concurrently.run(jobs) - initializedHosts = {resultHost: hostsMap[resultHost] for resultHost, result in results.items() if result} + initializedHosts = [host for host in hosts if results[host['name']]] return initializedHosts + +def _downloadHostsLogs(hosts): + try: + jobs = {host.name: (host.log.prepareAndDownload, '/var/log') + for host in hosts} + concurrently.run(jobs, numberOfThreads=10) + except: + logging.exception("Failed to dowwnload logs") + + +def _powerOffServerViaIPMI(hostToPowerOff): + serverIpmi = ipmi.IPMI(hostToPowerOff['ipmiHost'], + hostToPowerOff['ipmiUsername'], + hostToPowerOff['ipmiPassword']) + try: + serverIpmi._powerCommand('off') + return True + except: + logging.exception("Failed to power off %(host)s" % dict(host=hostToPowerOff['ipmiHost'])) + return False + + +def _powerOffServers(hosts): + jobs = {name: (_powerOffServerViaIPMI, hostToPowerOff['props']) + for name, hostToPowerOff in hosts.items()} + results = concurrently.run(jobs, numberOfThreads=30) + + sucessfullyPoweredOffHosts = [] + for hostId, result in results.items(): + if not result: + hosts[hostId]['result'].addCheck('init', 'IPMI power off', False, "Failed to connect via IPMI to %s" % hosts[hostId]['props']['ipmiHost']) + else: + hosts[hostId]['result'].addCheck('init', 'IPMI power off', True, '') + sucessfullyPoweredOffHosts.append(hosts[hostId]) + return sucessfullyPoweredOffHosts + + +def _createResultsMap(masterHost, hostsToInnagurate): + hostsMap = {} + for hostId, host in enumerate(hostsToInnagurate): + hostsMap[host['hostID']] = {'name': host['hostID'], + 'node': node.Node(host['hostID'], masterHost, host['macAddress'], host['ipAddress'], hostId), + 'props': host, + 'host': None, + 'result': servertestresult.ServerTestResult(host['hostID'])} + return hostsMap + + with open(args.rackYaml) as f: rackYaml = yaml.load(f) @@ -154,37 +254,35 @@ def _initializeFastNetworkOnTestHosts(hostsMap, vtags): ipmiHost=ipmiHost, ipmiUsername=ipmiUsername, ipmiPassword=ipmiPassword)) -innaguratedHosts = [] -testResults = [] + exitCode = -1 +hosts = _createResultsMap(masterHost, hostsToInnagurate) +poweredOnHosts = [] try: - innaguratedHosts, failedHosts = _allocateTestNodes(masterHost, hostsToInnagurate) - for failedHost, log in failedHosts.items(): - testResult = servertestresult.ServerTestResult(failedHost) - testResult.addCheck('init', 'innaugarate', False, log) - testResults.append(testResult) + logging.info("Powering hosts off before start") + hostsToProced = _powerOffServers(hosts) + poweredOnHosts = _allocateTestNodesInChunks(masterHost, hostsToProced) logging.info('Going to test servers %(names)s', - dict(names=' '.join([innaguratedHost.name for innaguratedHost in innaguratedHosts]))) - - hostsResultsMap = {innaguratedHost: servertestresult.ServerTestResult(innaguratedHost.name) - for innaguratedHost in innaguratedHosts} - testResults.extend(hostsResultsMap.values()) - hostsToRunCheckOnMap = _initializeFastNetworkOnTestHosts(hostsResultsMap, vtags) - if len(hostsToRunCheckOnMap) > 0: - healthchecher.checkServers(masterHost, hostsToRunCheckOnMap, vtags) - exitCode = 0 if len([testResult for testResult in testResults if not testResult.passed()]) == 0 else -1 + dict(names=' '.join([innaguratedHost['name'] for innaguratedHost in hostsToProced]))) + + hostsToProced = _initializeFastNetworkOnTestHosts(poweredOnHosts, vtags) + if len(hostsToProced) > 0: + logging.info("Going to check %(servers)d servers", dict(servers=len(hostsToProced))) + healthchecher.checkServers(masterHost, hostsToProced, vtags) + + exitCode = 0 if len([host for host in hosts.values() if not host['result'].passed()]) == 0 else -1 except: logging.exception("Failed running test script") finally: - printServerResults(testResults) - if args.debug: - import ipdb - ipdb.set_trace() - if len(innaguratedHosts) > 0: - logging.info("Powering hosts off") - jobs = {innaguratedHost.name: (ipmi.IPMI(ipmiHost, ipmiUsername, ipmiPassword)._powerCommand, 'off') - for innaguratedHost in innaguratedHosts} - concurrently.run(jobs) + try: + _downloadHostsLogs([masterHost] + [host['host'] for host in poweredOnHosts]) + printServerResults(hosts) + analyzeNetworks(poweredOnHosts, vtags) + finally: + if args.debug: + import ipdb + ipdb.set_trace() + _powerOffServers(hosts) logging.info('PASSED' if exitCode == 0 else 'FAILED') sys.exit(exitCode) diff --git a/py/rackattack/dryrun/plugins/logplugin.py b/py/rackattack/dryrun/plugins/logplugin.py new file mode 100644 index 0000000..11bb871 --- /dev/null +++ b/py/rackattack/dryrun/plugins/logplugin.py @@ -0,0 +1,46 @@ +from strato.racktest.hostundertest import plugins +from strato.common import log +import logging +import time +import os + + +TAR_COMMAND = """tar -c --warning=no-file-changed --use-compress-program=pigz -f %(targetpath)s %(srcPath)s +exitcode=$? +if [ "$exitcode" != "1" ] && [ "$exitcode" != "0" ]; then + exit $exitcode +fi +exit 0 +""" + + +class LogPlugin: + + def __init__(self, host): + self._host = host + + def prepareForDownload(self, path): + tarFileName = LogPlugin._remoteTarFileName() + tarFilePath = os.path.join("/tmp", tarFileName) + self._host.ssh.run.script(TAR_COMMAND % dict(targetpath=tarFilePath, srcPath=path)) + return tarFilePath + + def download(self, tarFilePath): + localTarPath = self._localTarFilePath(log.config.LOGS_DIRECTORY) + localTarDir = os.path.dirname(localTarPath) + if not os.path.exists(localTarDir): + os.makedirs(localTarDir) + self._host.ssh.ftp.getFile(tarFilePath, localTarPath) + + def prepareAndDownload(self, path): + zipedFilePath = self.prepareForDownload(path) + self.download(zipedFilePath) + + def _localTarFilePath(self, localDir): + return os.path.join(localDir, "logs.%(hostName)s" % dict(hostName=self._host.name), "logs.tar.gz") + + @staticmethod + def _remoteTarFileName(): + return "racktest.logplugin.%s.tar.gz" % time.strftime("%Y%m%d%H%M%S") + +plugins.register("log", LogPlugin) diff --git a/py/rackattack/dryrun/seeds/innaugurator.py b/py/rackattack/dryrun/seeds/innaugurator.py index ef12371..f666c76 100644 --- a/py/rackattack/dryrun/seeds/innaugurator.py +++ b/py/rackattack/dryrun/seeds/innaugurator.py @@ -1,7 +1,11 @@ import logging import socket import functools -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +import shutil +import datetime +import os +from strato.common.log import configurelogging +configurelogging.configureLogging('innaurugrator') import argparse import threading from rackattack.common import tftpboot @@ -12,6 +16,9 @@ from rackattack.common import globallock import time import network +from strato.common.multithreading import concurrently +from rackattack.physical import config +config.SERIAL_LOGS_DIRECTORY = '/var/log/rackattack' class Waiter: @@ -55,18 +62,18 @@ def _rawTCPConnect(tcpEndpoint): def inaugurateCheckIn(inaugurateInstance, innaguratedNode, rootfsLabel, notifier): - logging.info("Inaugurator checked in") + logging.info("Inaugurator checked in for node %(ip)s", dict(ip=innaguratedNode['ipAddress'])) inaugurateInstance.provideLabel(ipAddress=innaguratedNode['ipAddress'], label=rootfsLabel) notifier.notifyOne(innaguratedNode) def inaugurateDone(innaguratedNode, notifier): - logging.info("Inaugurator checked in") + logging.info("Inaugurator Done for node %(ip)s", dict(ip=innaguratedNode['ipAddress'])) notifier.notifyOne(innaguratedNode) def _prepareForInnauguration(dnsmasqInstance, inaugurateInstance, tftpbootInstance, - nodesToInnagurate, rootfsLabel, checkinWaiter, doneWaiter): + nodesToInnagurate, rootfsLabel, checkinWaiter, doneWaiter, noClearDisk): with globallock.lock(): for nodeToInnaugurate in nodesToInnagurate: dnsmasqInstance.add(nodeToInnaugurate['macAddress'], nodeToInnaugurate['ipAddress']) @@ -83,10 +90,34 @@ def _prepareForInnauguration(dnsmasqInstance, inaugurateInstance, tftpbootInstan doneCallback=doneCallback) tftpbootInstance.configureForInaugurator(nodeToInnaugurate['macAddress'], nodeToInnaugurate['ipAddress'], - clearDisk=True) + clearDisk=not noClearDisk) -def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): +def _waitServersToInitializeNetwork(nodesToWaitForIp): + def waitServerToInitNetwork(server): + try: + waitForTCPServer(server['ipAddress'], 22) + return True + except: + logging.exception("Failed to wait for active ssh connection on %(node)s", + dict(node=server['hostID'])) + return False + + jobs = {host['hostID']: (waitServerToInitNetwork, host) for host in nodesToWaitForIp} + results = concurrently.run(jobs) + + return [node for node in nodesToWaitForIp if not results[node['hostID']]] + + +def _powerCycleServer(nodeToInnaugurate): + ipmiInstance = ipmi.IPMI(nodeToInnaugurate['ipmiHost'], + nodeToInnaugurate['ipmiUsername'], + nodeToInnaugurate['ipmiPassword']) + ipmiInstance.forceBootFrom('pxe') + ipmiInstance.powerCycle() + + +def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate, noClearDisk): network.dropFirewall() logging.info("MyIP: %(ip)s", dict(ip=network.myIP())) @@ -103,8 +134,7 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): tftpboot=tftpbootInstance, serverIP=network.myIP(), netmask=network.netmask(), - firstIP=nodesToInnagurate[0]['ipAddress'], - lastIP=nodesToInnagurate[0]['ipAddress'], + ipAddress=nodesToInnagurate[0]['ipAddress'], gateway=network.gateway(), nameserver=network.myIP()) @@ -117,7 +147,7 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): doneWaiters = Waiter(nodesToInnagurate) _prepareForInnauguration(dnsmasqInstance, inaugurateInstance, tftpbootInstance, nodesToInnagurate, rootfsLabel, - checkinWaiters, doneWaiters) + checkinWaiters, doneWaiters, noClearDisk) solReaders = dict() for nodeToInnaugurate in nodesToInnagurate: sol = serialoverlan.SerialOverLan(nodeToInnaugurate['ipmiHost'], @@ -126,34 +156,25 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate): nodeToInnaugurate['hostID']) solReaders[nodeToInnaugurate['macAddress']] = sol - for nodeToInnaugurate in nodesToInnagurate: - ipmiInstance = ipmi.IPMI(nodeToInnaugurate['ipmiHost'], - nodeToInnaugurate['ipmiUsername'], - nodeToInnaugurate['ipmiPassword']) - ipmiInstance.forceBootFrom('pxe') - ipmiInstance.powerCycle() + jobs = {nodeToInnaugurate['ipmiHost']: (_powerCycleServer, nodeToInnaugurate) for nodeToInnaugurate in nodesToInnagurate} + concurrently.run(jobs) logging.info("Waiting for inaugurator to check in") failedNodesList = [] failedToCheckinNodes = checkinWaiters.waitAll(timeout=10 * 60) + nodesToInnagurate = [node for node in nodesToInnagurate if node not in failedToCheckinNodes] logging.error("Failed to checkin nodes %(nodes)s", dict(nodes=failedToCheckinNodes)) for nodeNotToWaitDone in failedToCheckinNodes: doneWaiters.notifyOne(nodeNotToWaitDone) - notDoneNodes = doneWaiters.waitAll(timeout=10 * 60) + notDoneNodes = doneWaiters.waitAll(timeout=7 * 60) + nodesToInnagurate = [node for node in nodesToInnagurate if node not in notDoneNodes] failedNodesList.extend(failedToCheckinNodes) failedNodesList.extend(notDoneNodes) logging.error("Failed to finish nodes %(nodes)s", dict(nodes=notDoneNodes)) nodesToWaitForIp = [node for node in nodesToInnagurate if node not in notDoneNodes] - # Now wait for all servers to obtain an IP - for nodeToInnaugurate in nodesToWaitForIp: - try: - waitForTCPServer(nodeToInnaugurate['ipAddress'], 22) - except: - logging.exception("Failed to wait for active ssh connection on %(node)s", - dict(node=nodesToInnagurate['hostID'])) - failedNodesList.append(nodeToInnaugurate) - + failedNodesList.extend(_waitServersToInitializeNetwork(nodesToWaitForIp)) failedNodes = {node['hostID']: open(solReaders[node['macAddress']].serialLogFilename()).read() for node in failedNodesList} + shutil.copy(dnsmasqInstance._logFile.name, '/var/log/rackattack/') return failedNodes From ac0bdf5e4ccbe97ec8549c4e32987f5b83072356 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 3 Feb 2015 22:34:03 +0200 Subject: [PATCH 23/27] fix in print in case of vt is not enabled in bios --- py/rackattack/dryrun/healthchecher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/rackattack/dryrun/healthchecher.py b/py/rackattack/dryrun/healthchecher.py index 7137576..7bb96ed 100644 --- a/py/rackattack/dryrun/healthchecher.py +++ b/py/rackattack/dryrun/healthchecher.py @@ -16,7 +16,7 @@ def _verifyVmxEnabledByBios(host, resultObject): result = True log = '' if(regvalue & VMX_ENABLED) != VMX_ENABLED: - log = "VMX is not enabled in bios register val %(regvalue)" % dict(regvalue=hex(regvalue)) + log = "VMX is not enabled in bios register val %(regvalue)x" % dict(regvalue=int(regvalue)) result = False resultObject.addCheck('virt', 'virtualization bios', result, log) From 86979651344f868daca896d082d51384fb69dde4 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 3 Feb 2015 22:34:33 +0200 Subject: [PATCH 24/27] ignore logs folder --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index e8702d9..65eb7ff 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ build .project .pydevproject +logs + From ff04381495982c969ab3c9e23ea71d6b943271fb Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 3 Feb 2015 22:35:21 +0200 Subject: [PATCH 25/27] innaugurator fix: use correct gateway address --- py/rackattack/dryrun/seeds/innaugurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/rackattack/dryrun/seeds/innaugurator.py b/py/rackattack/dryrun/seeds/innaugurator.py index f666c76..28e569c 100644 --- a/py/rackattack/dryrun/seeds/innaugurator.py +++ b/py/rackattack/dryrun/seeds/innaugurator.py @@ -125,7 +125,7 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate, noClearDisk): netmask=network.netmask(), inauguratorServerIP=network.myIP(), osmosisServerIP=osmosisServerIP, - inauguratorGatewayIP=network.myIP(), + inauguratorGatewayIP=network.gateway(), rootPassword="dryrun", withLocalObjectStore=True) dnsmasq.DNSMasq.eraseLeasesFile() @@ -166,7 +166,7 @@ def innaugurate(osmosisServerIP, rootfsLabel, nodesToInnagurate, noClearDisk): logging.error("Failed to checkin nodes %(nodes)s", dict(nodes=failedToCheckinNodes)) for nodeNotToWaitDone in failedToCheckinNodes: doneWaiters.notifyOne(nodeNotToWaitDone) - notDoneNodes = doneWaiters.waitAll(timeout=7 * 60) + notDoneNodes = doneWaiters.waitAll(timeout=15 * 60) nodesToInnagurate = [node for node in nodesToInnagurate if node not in notDoneNodes] failedNodesList.extend(failedToCheckinNodes) failedNodesList.extend(notDoneNodes) From 425f43f77ae8c8b5a9ba23ef019699595e88e24e Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 3 Feb 2015 22:36:27 +0200 Subject: [PATCH 26/27] moving to centos7 rootfs --- Makefile | 2 +- py/rackattack/dryrun/main.py | 2 +- solvent.manifest | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 4c2a812..5ac18a8 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ $(ROOTFS): build/smartctl -sudo mv $(ROOTFS) $(ROOTFS).tmp echo "Bringing source" -mkdir $(@D) - sudo -E solvent bring --repositoryBasename=rootfs-basic --product=rootfs --destination=$(ROOTFS).tmp + sudo -E solvent bring --repositoryBasename=rootfs-centos7-basic --product=rootfs --destination=$(ROOTFS).tmp sudo chroot $(ROOTFS).tmp yum install $(RPMS_TO_INSTALL) --assumeyes sudo mkdir $(ROOTFS).tmp/usr/share/inaugurator sudo cp ../inaugurator/build/inaugurator.thin.initrd.img ../inaugurator/build/inaugurator.vmlinuz $(ROOTFS).tmp/usr/share/inaugurator diff --git a/py/rackattack/dryrun/main.py b/py/rackattack/dryrun/main.py index 366e664..101e169 100644 --- a/py/rackattack/dryrun/main.py +++ b/py/rackattack/dryrun/main.py @@ -46,7 +46,7 @@ def allocateMasterHost(rackuser, label): client = clientfactory.factory() logging.info("Allocating master node") allocationInfo = api.AllocationInfo(user=rackuser, purpose="dryrun") - requirements = dict(master=api.Requirement(imageLabel=label, imageHint="rootfs-basic")) + requirements = dict(master=api.Requirement(imageLabel=label, imageHint="rootfs-centos7-basic")) allocation = client.allocate(requirements, allocationInfo) allocation.wait(timeout=5 * 60) logging.info("Allocation successful, waiting for ssh") diff --git a/solvent.manifest b/solvent.manifest index 4412c49..420f461 100644 --- a/solvent.manifest +++ b/solvent.manifest @@ -1,5 +1,5 @@ requirements: -- hash: 2c065fef7b3323e4449a36e368486b61d2e25e4f - originURL: https://github.com/Stratoscale/rootfs-basic.git +- hash: 0bdc94ee45ce920dd5454f06b7bdb8cb9607c32b + originURL: https://github.com/Stratoscale/rootfs-centos7-basic.git - hash: ec6af434552d852d7ad217402e5c7393102bb102 originURL: https://github.com/Stratoscale/rootfs-build.git From 818f7cbce324d44fda3918f0209d6717aa6ad9f4 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Tue, 3 Feb 2015 23:00:58 +0200 Subject: [PATCH 27/27] print modules list when failed to unload module --- py/rackattack/dryrun/plugins/kernel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/py/rackattack/dryrun/plugins/kernel.py b/py/rackattack/dryrun/plugins/kernel.py index d4e24a7..61ad0e4 100644 --- a/py/rackattack/dryrun/plugins/kernel.py +++ b/py/rackattack/dryrun/plugins/kernel.py @@ -35,7 +35,12 @@ def removeKernelModuleIfLoaded(self, module): def removeKernelModule(self, module): TIME_WAIT_FOR_RMMOD_TO_SUCCEEDD = 10 - waittonotthrow.WaitToNotThrow(timeout=TIME_WAIT_FOR_RMMOD_TO_SUCCEEDD).wait(lambda: self._host.ssh.run.script("rmmod %s" % module)) + try: + waittonotthrow.WaitToNotThrow(timeout=TIME_WAIT_FOR_RMMOD_TO_SUCCEEDD).wait(lambda: self._host.ssh.run.script("rmmod %s" % module)) + except: + logging.exception("Failed to remove module %(module)s lsmod=%(lsmod)s", + dict(module=module, lsmod=self._host.ssh.run.script("lsmod"))) + raise def isModuleLoaded(self, module): output = self._host.ssh.run.script("lsmod")