Temporarily disable machines after a connection failure
[?]
Jul 21, 2015, 1:53 PM
YR2IM6Y5XA6XDFBWJUFSVNMQQKLIW6A2LDS4F77A3OZLEMBACCRAC
Dependencies
- [2]
CNLNT3T4 Allow only 1 thread to send a closure to a given machine at the same time - [3]
MHVIT4JY Split hydra-queue-runner.cc more - [*]
5AIYUMTB Basic remote building - [*]
OCZ4LSGG Automatically retry aborted builds - [*]
GS4BE6TB Asynchronously compress build logs - [*]
5LBMP7GA Fix remote building - [*]
NAYQT2GT hydra-queue-runner: Use cmdBuildDerivation - [*]
HJOEIMLR Refactor
Change contents
- edit in src/hydra-queue-runner/build-remote.cc at line 163
{/* Disable this machine until a certain period of time has passed. This period increases on every consecutive failure. However, don't count failures that occurred soon after the last one (to take into account steps started in parallel). */auto info(machine->state->connectInfo.lock());auto now = std::chrono::system_clock::now();if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);info->lastFailure = now;int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);info->disabledUntil = now + std::chrono::seconds(delta);}} - edit in src/hydra-queue-runner/build-remote.cc at line 185
{auto info(machine->state->connectInfo.lock());info->consecutiveFailures = 0;} - replacement in src/hydra-queue-runner/builder.cc at line 36
int delta = retryInterval * powf(retryBackoff, step_->tries - 1);int delta = retryInterval * powf(retryBackoff, step_->tries - 1) + (rand() % 10); - edit in src/hydra-queue-runner/dispatcher.cc at line 39
system_time now = std::chrono::system_clock::now(); - replacement in src/hydra-queue-runner/dispatcher.cc at line 43
an ordering. std::sort() can segfault if it isn't. */an ordering. std::sort() can segfault if it isn't. Also filter out temporarily disabled machines. */ - replacement in src/hydra-queue-runner/dispatcher.cc at line 53
for (auto & m : *machines_)for (auto & m : *machines_) {auto info(m.second->state->connectInfo.lock());if (info->consecutiveFailures && info->disabledUntil > now) {if (info->disabledUntil < sleepUntil)sleepUntil = info->disabledUntil;continue;} - edit in src/hydra-queue-runner/dispatcher.cc at line 61
} - edit in src/hydra-queue-runner/dispatcher.cc at line 90
system_time now = std::chrono::system_clock::now(); - edit in src/hydra-queue-runner/state.hh at line 140
/* Bookkeeping used to temporarily disable a machine after a
   connection failure. Access goes through the `connectInfo`
   wrapper below, which callers lock before reading or writing. */
struct ConnectInfo
{
    /* `lastFailure` is the time of the most recent failure that
       was counted; `disabledUntil` is the time before which the
       dispatcher skips this machine. Both are only meaningful
       while `consecutiveFailures` is non-zero. */
    system_time lastFailure, disabledUntil;

    /* Number of counted consecutive connection failures; 0 means
       the machine is not disabled. It is tested before any failure
       has been recorded, so it must start at 0 regardless of how
       the enclosing object was initialised. */
    unsigned int consecutiveFailures = 0;
};
Sync<ConnectInfo> connectInfo;