grahamc/hydra2 - Change OCZ4LSGGSCMSLGC3C32D5JUYYHS5CIPOKOAMADEFAFZOFXJ3YY3AC

Automatically retry aborted builds

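Aborted builds are now put back on the runnable queue and retried after a delay (currently 60 seconds before the first retry, tripled for each subsequent retry). A step that keeps failing with a miscellaneous failure is given up on once `maxTries` (currently 5) attempts have been made.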

Created by Eelco Dolstra on June 17, 2015

OCZ4LSGGSCMSLGC3C32D5JUYYHS5CIPOKOAMADEFAFZOFXJ3YY3AC

Dependencies

In channels

main

Change contents

Replacement in src/hydra-queue-runner/build-remote.cc at line 87 [6.85]

B:BD[6.2296] → [6.2296:2374]

    printMsg(lvlError, format("sending %1% missing paths") % missing.size());

[6.2296]

[2.285]

    printMsg(lvlDebug, format("sending %1% missing paths") % missing.size());

Replacement in src/hydra-queue-runner/build-remote.cc at line 131 [6.85]

B:BD[6.3414] → [6.3414:3505]

    writeInt(SERVE_MAGIC_1, to);
    writeInt(SERVE_PROTOCOL_VERSION, to);
    to.flush();

[6.3414]

[6.3505]

    try {
        writeInt(SERVE_MAGIC_1, to);
        writeInt(SERVE_PROTOCOL_VERSION, to);
        to.flush();

Replacement in src/hydra-queue-runner/build-remote.cc at line 136 [6.85]

B:BD[6.3506] → [6.3506:3876]

    unsigned int magic = readInt(from);
    if (magic != SERVE_MAGIC_2)
        throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);
    unsigned int version = readInt(from);
    if (GET_PROTOCOL_MAJOR(version) != 0x200)
        throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);

[6.3506]

[6.3876]

        unsigned int magic = readInt(from);
        if (magic != SERVE_MAGIC_2)
            throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);
        unsigned int version = readInt(from);
        if (GET_PROTOCOL_MAJOR(version) != 0x200)
            throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);
    } catch (EndOfFile & e) {
        child.pid.wait(true);
        throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile)));
    }

Replacement in src/hydra-queue-runner/build-remote.cc at line 148 [6.85]

B:BD[6.3911] → [6.3911:4008]

    printMsg(lvlError, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName);

[6.3911]

[6.4008]

    printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName);

Replacement in src/hydra-queue-runner/build-remote.cc at line 152 [6.85]

B:BD[6.4089] → [6.4089:4176]

    printMsg(lvlError, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName);

[6.4089]

[6.4176]

    printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName);

Replacement in src/hydra-queue-runner/build-remote.cc at line 170 [6.85]

B:BD[6.4825] → [6.4825:4924]

    printMsg(lvlError, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName);

[6.4825]

[6.4924]

    printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName);

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 7 [8.4840]
[7.286]
[7.286]
```
#include <cmath>
#include <chrono>
```
Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 23 [8.4840]
[8.5058]
[9.21]

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 25 [8.4840]

[9.22]

/* Retry policy for build steps that end in a miscellaneous failure. */

/* Limit on the number of attempts made for a single step. */
constexpr int maxTries = 5;

/* Delay before the first retry, in seconds. */
constexpr int retryInterval = 60;

/* Factor by which the delay is multiplied for each further retry. */
constexpr float retryBackoff = 3.0f;

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 29 [8.4840]
[9.23]
[9.23]
```
/* A point in time on std::chrono::system_clock. */
using system_time = std::chrono::time_point<std::chrono::system_clock>;
```

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 113 [8.4840]

[9.626]


        /* Number of times we've tried this step. */
        unsigned int tries = 0;
        /* Point in time after which the step can be retried. */
        system_time after;

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 127 [8.4840]
B:BD[8.5994] → [9.658:676]
B:BD[9.676] → [4.66:135]
∅:D[4.135] → [6.5921:5927]
B:BD[9.745] → [6.5921:5927]
```
    ~Step()
    {
        printMsg(lvlDebug, format("destroying step %1%") % drvPath);
    }
```
[8.5994]
[6.5927]
```
    ~Step() { }
```

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 214 [8.4840]

B:BD[5.27] → [5.27:64]

    std::atomic<int> nrQueueWakeups;

[5.27]

[6.7253]

    std::atomic<unsigned int> nrRetries;
    std::atomic<unsigned int> maxNrRetries;
    std::atomic<unsigned int> nrQueueWakeups;
    std::atomic<unsigned int> nrDispatcherWakeups;

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 265 [8.4840]

B:BD[6.7634] → [6.7634:7704]

    void doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,

[6.7634]

[6.7704]

    /* Perform the given build step. Return true if the step is to be
       retried. */
    bool doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 283 [8.4840]
B:BD[8.7503] → [5.90:114]
```
    nrQueueWakeups = 0;
```
[8.7503]
[5.114]
```
    nrRetries = maxNrRetries = nrQueueWakeups = nrDispatcherWakeups = 0;
```
Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 820 [8.4840]
[7.2578]
[7.2622]
```
        auto sleepUntil = system_time::max();
```
Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 829 [8.4840]
[6.10590]
[6.10590]
```
            system_time now = std::chrono::system_clock::now();
```

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 838 [8.4840]

[6.10848]

                }
                /* Skip previously failed steps that aren't ready to
                   be retried. */
                {
                    auto step_(step->state.lock());
                    if (step_->tries > 0 && step_->after > now) {
                        if (step_->after < sleepUntil)
                            sleepUntil = step_->after;
                        ++i;
                        continue;
                    }

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 870 [8.4840]

B:BD[6.11703] → [6.11703:11744]

            dispatcherWakeup.wait(lock);

[6.11703]

[6.11744]

            printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
                std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
            dispatcherWakeup.wait_until(lock, sleepUntil);
            nrDispatcherWakeups++;

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 911 [8.4840]
[6.12653]
[6.12653]
```
    bool retry = true;
```

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 915 [8.4840]

B:BD[6.12712] → [6.12712:12768]

        doBuildStep(store, step, reservation->machine);

[6.12712]

[6.12768]

        retry = doBuildStep(store, step, reservation->machine);

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 917 [8.4840]

B:BD[6.12803] → [10.0:96]

∅:D[10.96] → [6.12901:12955]

B:BD[6.12901] → [6.12901:12955]

        printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what());
        // FIXME: put step back in runnable and retry

[6.12803]

[6.12955]

        printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % reservation->machine->sshName % e.what());

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 925 [8.4840]

[6.13097]

[6.13139]


    /* If there was a temporary failure, retry the step after an
       exponentially increasing interval. */
    if (retry) {
        {
            auto step_(step->state.lock());
            step_->tries++;
            nrRetries++;
            if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
            int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
            printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
            step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
        }
        makeRunnable(step);
    }

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 944 [8.4840]

B:BD[6.13143] → [6.13143:13216]

void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,

[6.13143]

[6.13216]

bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 961 [8.4840]

B:BD[9.7389] → [9.7389:7457]

               are gone (e.g. cancelled). So don't bother. (This is

[9.7389]

[9.7457]

               are gone (e.g. cancelled). So don't bother. This is

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 964 [8.4840]

B:BD[9.7581] → [9.7581:7680]

               Build). FIXME: what if a new Build gets a reference to
               this step? */

[9.7581]

[9.7680]

               Build. However, it's possible that a new Build just
               created a reference to this step. So to handle that
               possibility, we retry this step (putting it back in
               the runnable queue). If there are really no strong
               pointers to the step, it will be deleted. */

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 970 [8.4840]
B:BD[9.7770] → [9.7770:7828]
```
            destroyStep(step, false);
            return;
```
[9.7770]
[9.7828]
```
            return true;
```

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 990 [8.4840]

B:BD[10.143] → [10.143:224]

    /* If any of the outputs have previously failed, then don't
       retry. */

[10.143]

[11.1233]

    /* If any of the outputs have previously failed, then don't bother
       building again. */

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1012 [8.4840]

B:BD[10.1234] → [10.1234:1298]

B:BD[10.1298] → [4.2011:2041]

            printMsg(lvlError, format("ERROR: %1%") % e.msg());
            abort(); // FIXME

[10.1234]

[10.1319]

            printMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%")
                % step->drvPath % machine->sshName % e.msg());

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 1023 [8.4840]

[10.1530]

[9.8142]

    bool retry = false;
    if (result.status == RemoteResult::rrMiscFailure) {
        auto step_(step->state.lock());
        retry = step_->tries + 1 < maxTries;
    }

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1035 [8.4840]
B:BD[9.8563] → [9.8563:8569]
```
    {
```
[9.8563]
[9.8569]
```
    if (!retry) {
```

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 1068 [8.4840]

[12.200]

[10.1564]

            BuildStatus buildStatus =
                result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted;
            BuildStepStatus buildStepStatus =
                result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted;

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1075 [8.4840]

B:BD[10.1658] → [10.1658:1742]

            if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = "";

[10.1658]

[10.1742]

            if (buildStatus != bsAborted) result.errorMsg = "";

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1077 [8.4840]
B:BD[10.1743] → [10.1743:1777]
```
            if (!cachedFailure) {
```
[10.1743]
[10.1777]
```
            if (!cachedFailure && !retry) {
```

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1083 [8.4840]

B:BD[10.1983] → [3.270:386]

                    createBuildStep(txn, 0, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id);

[10.1983]

[10.2113]

                    createBuildStep(txn, 0, build2, step, machine->sshName,
                        buildStepStatus, result.errorMsg, build->id);

Deletion in src/hydra-queue-runner/hydra-queue-runner.cc at line 1087 [8.4840]

B:BD[8.16984] → [10.2132:2270]

                finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg);

Insertion in src/hydra-queue-runner/hydra-queue-runner.cc at line 1088 [8.4840]

[10.2284]


            if (!cachedFailure)
                finishBuildStep(txn, result.startTime, result.stopTime, build->id,
                    stepNr, machine->sshName, buildStepStatus, result.errorMsg);

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1094 [8.4840]

B:BD[8.17060] → [9.9094:9139]

B:BD[9.9139] → [6.14279:14367]

∅:D[9.9139] → [8.17101:17135]

∅:D[6.14367] → [8.17101:17135]

B:BD[8.17101] → [8.17101:17135]

B:BD[8.17135] → [10.2286:2434]

∅:D[13.694] → [8.17272:17393]

∅:D[10.2434] → [8.17272:17393]

B:BD[8.17272] → [8.17272:17393]

B:BD[8.17393] → [6.14368:14407]

B:BD[6.14407] → [10.2435:2525]

∅:D[10.2525] → [8.17464:17548]

∅:D[6.14453] → [8.17464:17548]

B:BD[8.17464] → [8.17464:17548]

            for (auto build2 : dependents) {
                printMsg(lvlError, format("marking build %1% as failed") % build2->id);
                txn.parameterized
                    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
                    (build2->id)
                    ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed))
                    (result.startTime)
                    (result.stopTime)
                    (cachedFailure ? 1 : 0).exec();
                build2->finishedInDB = true; // FIXME: txn might fail
            }

[8.17060]

[10.2526]

            if (!retry)
                for (auto build2 : dependents) {
                    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
                        (build2->id)
                        ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
                        (result.startTime)
                        (result.stopTime)
                        (cachedFailure ? 1 : 0).exec();
                    build2->finishedInDB = true; // FIXME: txn might fail
                }

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1121 [8.4840]

B:BD[9.9391] → [9.9391:9426]

B:BD[9.9426] → [6.14454:14538]

∅:D[6.14538] → [9.9478:9569]

B:BD[9.9478] → [9.9478:9569]

    for (auto build2 : dependents)
        if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
            auto builds_(builds.lock());
            builds_->erase(build2->id);
        }

[9.9391]

[9.9569]

    if (!retry)
        for (auto build2 : dependents)
            if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
                auto builds_(builds.lock());
                builds_->erase(build2->id);
            }

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1131 [8.4840]

B:BD[9.9718] → [6.14539:14604]

    destroyStep(step, result.status == RemoteResult::rrSuccess);

[9.9718]

[8.17669]

    if (!retry)
        destroyStep(step, result.status == RemoteResult::rrSuccess);
    return retry;

Replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1202 [8.4840]

B:BD[5.934] → [5.934:1024]

    printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups);

[5.934]

[5.1024]

    printMsg(lvlError, format("%1% build step retries") % nrRetries);
    printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries);
    printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups);
    printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups);
    printMsg(lvlError, format("%1% database connections") % dbPool.count());

Deletion in src/hydra-queue-runner/hydra-queue-runner.cc at line 1229 [8.4840]
B:BD[7.3510] → [14.1130:1131]
B:BD[14.1131] → [4.2120:2235]
```
    //printMsg(lvlInfo, "exiting...");
    //printMsg(lvlInfo, format("psql connections = %1%") % dbPool.count());
```

Replacement in src/root/build.tt at line 54 [16.7350]

B:BD[15.1194] → [15.1194:1245]

                <span class="error">Aborted</span>

[15.1194]

[17.0]

                <span class="error"><strong>Aborted</strong>[% IF step.errormsg %]: [% HTML.escape(step.errormsg); END %]</span>

Insertion in src/sql/hydra.sql at line 375 [19.1]
[18.1109]
[18.1109]
```
-- FIXME: remove
```