Automatically retry aborted builds

[?]
Jun 17, 2015, 9:45 AM
OCZ4LSGGSCMSLGC3C32D5JUYYHS5CIPOKOAMADEFAFZOFXJ3YY3AC

Dependencies

  • [2] 6TY4LNHH Finish copyClosure
  • [3] MSIHMO45 Tweak build steps
  • [4] GKZN4UV7 Make the queue monitor more robust, and better debug output
  • [5] RQUAATWB Add status dump facility
  • [6] 2GUAKGTB Fix indentation of build.tt
  • [7] FQQRJUO4 Mark builds as busy
  • [8] RYTQLATY Keep track of failed paths in the Hydra database
  • [9] 24BMQDZA Start of single-process hydra-queue-runner
  • [10] PH3DFCNU Render machine correctly if it doesn't contain @
  • [11] NJJ7H64S Very basic multi-threaded queue runner
  • [12] ENXUSMSV Make concurrency more robust
  • [13] 5AIYUMTB Basic remote building
  • [14] T2EIYJNG On SIGINT, shut down the builder threads
  • [15] JGLE5BRN Add separate build step status codes for cached failures and timeouts
  • [16] C6HOMHZW Don't try to handle SIGINT
  • [17] N5O7VEEO Immediately abort builds that require an unsupported system type
  • [18] IWB3F4Z6 Fail builds with previously failed steps early
  • [19] LJILHOJ7 Create BuildSteps race-free
  • [*] J5UVLXOK * Start of a basic Catalyst web interface.
  • [*] N22GPKYT * Put info about logs / build products in the DB.
  • [*] 4HPT4SDD Revert "Remove now-unused SystemTypes table"

Change contents

  • replacement in src/hydra-queue-runner/build-remote.cc at line 87
    [6.2296][6.2296:2374]()
    printMsg(lvlError, format("sending %1% missing paths") % missing.size());
    [6.2296]
    [2.285]
    printMsg(lvlDebug, format("sending %1% missing paths") % missing.size());
  • replacement in src/hydra-queue-runner/build-remote.cc at line 131
    [6.3414][6.3414:3505]()
    writeInt(SERVE_MAGIC_1, to);
    writeInt(SERVE_PROTOCOL_VERSION, to);
    to.flush();
    [6.3414]
    [6.3505]
    try {
    writeInt(SERVE_MAGIC_1, to);
    writeInt(SERVE_PROTOCOL_VERSION, to);
    to.flush();
  • replacement in src/hydra-queue-runner/build-remote.cc at line 136
    [6.3506][6.3506:3876]()
    unsigned int magic = readInt(from);
    if (magic != SERVE_MAGIC_2)
    throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);
    unsigned int version = readInt(from);
    if (GET_PROTOCOL_MAJOR(version) != 0x200)
    throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);
    [6.3506]
    [6.3876]
    unsigned int magic = readInt(from);
    if (magic != SERVE_MAGIC_2)
    throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);
    unsigned int version = readInt(from);
    if (GET_PROTOCOL_MAJOR(version) != 0x200)
    throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);
    } catch (EndOfFile & e) {
    child.pid.wait(true);
    throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile)));
    }
  • replacement in src/hydra-queue-runner/build-remote.cc at line 148
    [6.3911][6.3911:4008]()
    printMsg(lvlError, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName);
    [6.3911]
    [6.4008]
    printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName);
  • replacement in src/hydra-queue-runner/build-remote.cc at line 152
    [6.4089][6.4089:4176]()
    printMsg(lvlError, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName);
    [6.4089]
    [6.4176]
    printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName);
  • replacement in src/hydra-queue-runner/build-remote.cc at line 170
    [6.4825][6.4825:4924]()
    printMsg(lvlError, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName);
    [6.4825]
    [6.4924]
    printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName);
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 7
    [6.286]
    [6.286]
    #include <cmath>
    #include <chrono>
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 23
    [6.5058]
    [6.21]
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 25
    [6.22]
    [6.22]
    const int maxTries = 5;
    const int retryInterval = 60; // seconds
    const float retryBackoff = 3.0;
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 29
    [6.23]
    [6.23]
    typedef std::chrono::time_point<std::chrono::system_clock> system_time;
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 113
    [6.626]
    [6.626]
    /* Number of times we've tried this step. */
    unsigned int tries = 0;
    /* Point in time after which the step can be retried. */
    system_time after;
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 127
    [6.5994][6.658:676](),[6.676][4.66:135](),[4.135][6.5921:5927](),[6.745][6.5921:5927]()
    ~Step()
    {
    printMsg(lvlDebug, format("destroying step %1%") % drvPath);
    }
    [6.5994]
    [6.5927]
    ~Step() { }
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 214
    [5.27][5.27:64]()
    std::atomic<int> nrQueueWakeups;
    [5.27]
    [6.7253]
    std::atomic<unsigned int> nrRetries;
    std::atomic<unsigned int> maxNrRetries;
    std::atomic<unsigned int> nrQueueWakeups;
    std::atomic<unsigned int> nrDispatcherWakeups;
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 265
    [6.7634][6.7634:7704]()
    void doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
    [6.7634]
    [6.7704]
    /* Perform the given build step. Return true if the step is to be
    retried. */
    bool doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 283
    [6.7503][5.90:114]()
    nrQueueWakeups = 0;
    [6.7503]
    [5.114]
    nrRetries = maxNrRetries = nrQueueWakeups = nrDispatcherWakeups = 0;
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 820
    [6.2578]
    [6.2622]
    auto sleepUntil = system_time::max();
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 829
    [6.10590]
    [6.10590]
    system_time now = std::chrono::system_clock::now();
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 838
    [6.10848]
    [6.10848]
    }
    /* Skip previously failed steps that aren't ready to
    be retried. */
    {
    auto step_(step->state.lock());
    if (step_->tries > 0 && step_->after > now) {
    if (step_->after < sleepUntil)
    sleepUntil = step_->after;
    ++i;
    continue;
    }
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 870
    [6.11703][6.11703:11744]()
    dispatcherWakeup.wait(lock);
    [6.11703]
    [6.11744]
    printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
    std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
    dispatcherWakeup.wait_until(lock, sleepUntil);
    nrDispatcherWakeups++;
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 911
    [6.12653]
    [6.12653]
    bool retry = true;
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 915
    [6.12712][6.12712:12768]()
    doBuildStep(store, step, reservation->machine);
    [6.12712]
    [6.12768]
    retry = doBuildStep(store, step, reservation->machine);
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 917
    [6.12803][6.0:96](),[6.96][6.12901:12955](),[6.12901][6.12901:12955]()
    printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what());
    // FIXME: put step back in runnable and retry
    [6.12803]
    [6.12955]
    printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
    % step->drvPath % reservation->machine->sshName % e.what());
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 925
    [6.13097]
    [6.13139]
    /* If there was a temporary failure, retry the step after an
    exponentially increasing interval. */
    if (retry) {
    {
    auto step_(step->state.lock());
    step_->tries++;
    nrRetries++;
    if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
    int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
    printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
    step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
    }
    makeRunnable(step);
    }
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 944
    [6.13143][6.13143:13216]()
    void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
    [6.13143]
    [6.13216]
    bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 961
    [6.7389][6.7389:7457]()
    are gone (e.g. cancelled). So don't bother. (This is
    [6.7389]
    [6.7457]
    are gone (e.g. cancelled). So don't bother. This is
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 964
    [6.7581][6.7581:7680]()
    Build). FIXME: what if a new Build gets a reference to
    this step? */
    [6.7581]
    [6.7680]
    Build. However, it's possible that a new Build just
    created a reference to this step. So to handle that
    possibility, we retry this step (putting it back in
    the runnable queue). If there are really no strong
    pointers to the step, it will be deleted. */
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 970
    [6.7770][6.7770:7828]()
    destroyStep(step, false);
    return;
    [6.7770]
    [6.7828]
    return true;
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 990
    [6.143][6.143:224]()
    /* If any of the outputs have previously failed, then don't
    retry. */
    [6.143]
    [6.1233]
    /* If any of the outputs have previously failed, then don't bother
    building again. */
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1012
    [6.1234][6.1234:1298](),[6.1298][4.2011:2041]()
    printMsg(lvlError, format("ERROR: %1%") % e.msg());
    abort(); // FIXME
    [6.1234]
    [6.1319]
    printMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%")
    % step->drvPath % machine->sshName % e.msg());
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1023
    [6.1530]
    [6.8142]
    bool retry = false;
    if (result.status == RemoteResult::rrMiscFailure) {
    auto step_(step->state.lock());
    retry = step_->tries + 1 < maxTries;
    }
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1035
    [6.8563][6.8563:8569]()
    {
    [6.8563]
    [6.8569]
    if (!retry) {
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1068
    [6.200]
    [6.1564]
    BuildStatus buildStatus =
    result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted;
    BuildStepStatus buildStepStatus =
    result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted;
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1075
    [6.1658][6.1658:1742]()
    if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = "";
    [6.1658]
    [6.1742]
    if (buildStatus != bsAborted) result.errorMsg = "";
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1077
    [6.1743][6.1743:1777]()
    if (!cachedFailure) {
    [6.1743]
    [6.1777]
    if (!cachedFailure && !retry) {
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1083
    [6.1983][3.270:386]()
    createBuildStep(txn, 0, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id);
    [6.1983]
    [6.2113]
    createBuildStep(txn, 0, build2, step, machine->sshName,
    buildStepStatus, result.errorMsg, build->id);
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1087
    [6.16984][6.2132:2270]()
    finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg);
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1088
    [6.2284]
    [6.2284]
    if (!cachedFailure)
    finishBuildStep(txn, result.startTime, result.stopTime, build->id,
    stepNr, machine->sshName, buildStepStatus, result.errorMsg);
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1094
    [6.17060][6.9094:9139](),[6.9139][6.14279:14367](),[6.9139][6.17101:17135](),[6.14367][6.17101:17135](),[6.17101][6.17101:17135](),[6.17135][6.2286:2434](),[6.694][6.17272:17393](),[6.2434][6.17272:17393](),[6.17272][6.17272:17393](),[6.17393][6.14368:14407](),[6.14407][6.2435:2525](),[6.2525][6.17464:17548](),[6.14453][6.17464:17548](),[6.17464][6.17464:17548]()
    for (auto build2 : dependents) {
    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
    txn.parameterized
    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
    (build2->id)
    ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed))
    (result.startTime)
    (result.stopTime)
    (cachedFailure ? 1 : 0).exec();
    build2->finishedInDB = true; // FIXME: txn might fail
    }
    [6.17060]
    [6.2526]
    if (!retry)
    for (auto build2 : dependents) {
    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
    txn.parameterized
    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
    (build2->id)
    ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
    (result.startTime)
    (result.stopTime)
    (cachedFailure ? 1 : 0).exec();
    build2->finishedInDB = true; // FIXME: txn might fail
    }
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1121
    [6.9391][6.9391:9426](),[6.9426][6.14454:14538](),[6.14538][6.9478:9569](),[6.9478][6.9478:9569]()
    for (auto build2 : dependents)
    if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
    auto builds_(builds.lock());
    builds_->erase(build2->id);
    }
    [6.9391]
    [6.9569]
    if (!retry)
    for (auto build2 : dependents)
    if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
    auto builds_(builds.lock());
    builds_->erase(build2->id);
    }
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1131
    [6.9718][6.14539:14604]()
    destroyStep(step, result.status == RemoteResult::rrSuccess);
    [6.9718]
    [6.17669]
    if (!retry)
    destroyStep(step, result.status == RemoteResult::rrSuccess);
    return retry;
  • replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1202
    [5.934][5.934:1024]()
    printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups);
    [5.934]
    [5.1024]
    printMsg(lvlError, format("%1% build step retries") % nrRetries);
    printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries);
    printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups);
    printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups);
    printMsg(lvlError, format("%1% database connections") % dbPool.count());
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1229
    [6.3510][6.1130:1131](),[6.1131][4.2120:2235]()
    //printMsg(lvlInfo, "exiting...");
    //printMsg(lvlInfo, format("psql connections = %1%") % dbPool.count());
  • replacement in src/root/build.tt at line 54
    [6.1194][6.1194:1245]()
    <span class="error">Aborted</span>
    [6.1194]
    [6.0]
    <span class="error"><strong>Aborted</strong>[% IF step.errormsg %]: [% HTML.escape(step.errormsg); END %]</span>
  • edit in src/sql/hydra.sql at line 375
    [23.1109]
    [23.1109]
    -- FIXME: remove