Temporarily disable machines after a connection failure

[?]
Jul 21, 2015, 1:53 PM
YR2IM6Y5XA6XDFBWJUFSVNMQQKLIW6A2LDS4F77A3OZLEMBACCRAC

Dependencies

  • [2] CNLNT3T4 Allow only 1 thread to send a closure to a given machine at the same time
  • [3] MHVIT4JY Split hydra-queue-runner.cc more
  • [*] 5AIYUMTB Basic remote building
  • [*] OCZ4LSGG Automatically retry aborted builds
  • [*] GS4BE6TB Asynchronously compress build logs
  • [*] 5LBMP7GA Fix remote building
  • [*] NAYQT2GT hydra-queue-runner: Use cmdBuildDerivation
  • [*] HJOEIMLR Refactor

Change contents

  • edit in src/hydra-queue-runner/build-remote.cc at line 163
    [6.647]
    [7.354]
    {
        /* Take this machine out of rotation for a while. The pause
           grows with each consecutive failure (the counter is capped
           at 4). A failure arriving within 30 seconds of the last
           counted one is not counted again, to account for steps
           that were started in parallel. */
        auto connInfo(machine->state->connectInfo.lock());
        auto failedAt = std::chrono::system_clock::now();
        bool countsAsNewFailure =
            connInfo->consecutiveFailures == 0
            || connInfo->lastFailure < failedAt - std::chrono::seconds(30);
        if (countsAsNewFailure) {
            connInfo->consecutiveFailures = std::min(connInfo->consecutiveFailures + 1, 4u);
            connInfo->lastFailure = failedAt;
            /* Backoff based on the failure count, plus 0-29 seconds of
               random jitter. */
            int pauseSecs = retryInterval * powf(retryBackoff, connInfo->consecutiveFailures - 1) + (rand() % 30);
            printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % pauseSecs);
            connInfo->disabledUntil = failedAt + std::chrono::seconds(pauseSecs);
        }
    }
  • edit in src/hydra-queue-runner/build-remote.cc at line 185
    [8.7]
    [9.305]
    {
        /* The connection worked, so clear the failure streak. The lock
           handle is a temporary that lives only for this statement. */
        machine->state->connectInfo.lock()->consecutiveFailures = 0;
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 36
    [3.1163][3.1163:1241]()
    int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
    [3.1163]
    [3.1241]
    int delta = retryInterval * powf(retryBackoff, step_->tries - 1) + (rand() % 10);
  • edit in src/hydra-queue-runner/dispatcher.cc at line 39
    [3.15294]
    [3.15294]
    system_time now = std::chrono::system_clock::now();
  • replacement in src/hydra-queue-runner/dispatcher.cc at line 43
    [3.15430][3.15430:15499]()
    an ordering. std::sort() can segfault if it isn't. */
    [3.15430]
    [3.15499]
    an ordering. std::sort() can segfault if it isn't. Also
    filter out temporarily disabled machines. */
  • replacement in src/hydra-queue-runner/dispatcher.cc at line 53
    [3.15755][3.15755:15799]()
    for (auto & m : *machines_)
    [3.15755]
    [3.15799]
    for (auto & m : *machines_) {
    auto info(m.second->state->connectInfo.lock());
    if (info->consecutiveFailures && info->disabledUntil > now) {
    if (info->disabledUntil < sleepUntil)
    sleepUntil = info->disabledUntil;
    continue;
    }
  • edit in src/hydra-queue-runner/dispatcher.cc at line 61
    [3.15887]
    [3.15887]
    }
  • edit in src/hydra-queue-runner/dispatcher.cc at line 90
    [3.17173][3.17173:17237]()
    system_time now = std::chrono::system_clock::now();
  • edit in src/hydra-queue-runner/state.hh at line 140
    [2.159]
    [2.159]
    /* Connection-failure bookkeeping for a machine. */
    struct ConnectInfo
    {
        /* lastFailure: time of the most recent failure that was counted.
           disabledUntil: the machine is skipped while this lies in the
           future and consecutiveFailures is non-zero. */
        system_time lastFailure, disabledUntil;
        /* Number of consecutive connection failures. Explicitly
           zero-initialised because it is read before it is first
           written; without the initializer its value would depend on
           how the enclosing object happens to be constructed. */
        unsigned int consecutiveFailures = 0;
    };
    Sync<ConnectInfo> connectInfo;