Automatically retry aborted builds
[?]
Jun 17, 2015, 9:45 AM
OCZ4LSGGSCMSLGC3C32D5JUYYHS5CIPOKOAMADEFAFZOFXJ3YY3AC
Dependencies
- [2] 6TY4LNHH Finish copyClosure
- [3] MSIHMO45 Tweak build steps
- [4] GKZN4UV7 Make the queue monitor more robust, and better debug output
- [5] RQUAATWB Add status dump facility
- [6] NJJ7H64S Very basic multi-threaded queue runner
- [7] IWB3F4Z6 Fail builds with previously failed steps early
- [8] 5AIYUMTB Basic remote building
- [9] 2GUAKGTB Fix indentation of build.tt
- [10] ENXUSMSV Make concurrency more robust
- [11] C6HOMHZW Don't try to handle SIGINT
- [12] LJILHOJ7 Create BuildSteps race-free
- [13] N5O7VEEO Immediately abort builds that require an unsupported system type
- [14] T2EIYJNG On SIGINT, shut down the builder threads
- [15] PH3DFCNU Render machine correctly if it doesn't contain @
- [16] JGLE5BRN Add separate build step status codes for cached failures and timeouts
- [17] 24BMQDZA Start of single-process hydra-queue-runner
- [18] FQQRJUO4 Mark builds as busy
- [19] RYTQLATY Keep track of failed paths in the Hydra database
- [*] J5UVLXOK * Start of a basic Catalyst web interface.
- [*] N22GPKYT * Put info about logs / build products in the DB.
- [*] 4HPT4SDD Revert "Remove now-unused SystemTypes table"
Change contents
- replacement in src/hydra-queue-runner/build-remote.cc at line 87
printMsg(lvlError, format("sending %1% missing paths") % missing.size());printMsg(lvlDebug, format("sending %1% missing paths") % missing.size()); - replacement in src/hydra-queue-runner/build-remote.cc at line 131
writeInt(SERVE_MAGIC_1, to);writeInt(SERVE_PROTOCOL_VERSION, to);to.flush();try {writeInt(SERVE_MAGIC_1, to);writeInt(SERVE_PROTOCOL_VERSION, to);to.flush(); - replacement in src/hydra-queue-runner/build-remote.cc at line 136
unsigned int magic = readInt(from);if (magic != SERVE_MAGIC_2)throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);unsigned int version = readInt(from);if (GET_PROTOCOL_MAJOR(version) != 0x200)throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);unsigned int magic = readInt(from);if (magic != SERVE_MAGIC_2)throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);unsigned int version = readInt(from);if (GET_PROTOCOL_MAJOR(version) != 0x200)throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);} catch (EndOfFile & e) {child.pid.wait(true);throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile)));} - replacement in src/hydra-queue-runner/build-remote.cc at line 148
printMsg(lvlError, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName);printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); - replacement in src/hydra-queue-runner/build-remote.cc at line 152
printMsg(lvlError, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName);printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); - replacement in src/hydra-queue-runner/build-remote.cc at line 170
printMsg(lvlError, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName);printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 7
#include <cmath>#include <chrono> - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 23
- edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 25
const int maxTries = 5;
const int retryInterval = 60; // seconds
const float retryBackoff = 3.0; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 29
typedef std::chrono::time_point<std::chrono::system_clock> system_time; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 113
/* Number of times we've tried this step. */
unsigned int tries = 0;
/* Point in time after which the step can be retried. */
system_time after; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 127[6.5994]→[6.658:676](∅→∅),[6.676]→[4.66:135](∅→∅),[4.135]→[6.5921:5927](∅→∅),[6.745]→[6.5921:5927](∅→∅)
~Step(){printMsg(lvlDebug, format("destroying step %1%") % drvPath);}~Step() { } - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 214
std::atomic<int> nrQueueWakeups;std::atomic<unsigned int> nrRetries;std::atomic<unsigned int> maxNrRetries;std::atomic<unsigned int> nrQueueWakeups;std::atomic<unsigned int> nrDispatcherWakeups; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 265
void doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
/* Perform the given build step. Return true if the step is to be
retried. */
bool doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step, - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 283
nrQueueWakeups = 0;nrRetries = maxNrRetries = nrQueueWakeups = nrDispatcherWakeups = 0; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 820
auto sleepUntil = system_time::max(); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 829
system_time now = std::chrono::system_clock::now(); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 838
}/* Skip previously failed steps that aren't ready tobe retried. */{auto step_(step->state.lock());if (step_->tries > 0 && step_->after > now) {if (step_->after < sleepUntil)sleepUntil = step_->after;++i;continue;} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 870
dispatcherWakeup.wait(lock);printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());dispatcherWakeup.wait_until(lock, sleepUntil);nrDispatcherWakeups++; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 911
bool retry = true; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 915
doBuildStep(store, step, reservation->machine);retry = doBuildStep(store, step, reservation->machine); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 917
printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what());// FIXME: put step back in runnable and retryprintMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")% step->drvPath % reservation->machine->sshName % e.what()); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 925
/* If there was a temporary failure, retry the step after an
exponentially increasing interval. */
if (retry) {
    {
        auto step_(step->state.lock());
        step_->tries++;
        nrRetries++;
        if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
        int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
        printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
        step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
    }
    makeRunnable(step);
} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 944
void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step, - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 961
are gone (e.g. cancelled). So don't bother. (This isare gone (e.g. cancelled). So don't bother. This is - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 964
Build). FIXME: what if a new Build gets a reference to
this step? */
Build. However, it's possible that a new Build just
created a reference to this step. So to handle that
possibility, we retry this step (putting it back in
the runnable queue). If there are really no strong
pointers to the step, it will be deleted. */ - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 970
destroyStep(step, false);return;return true; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 990
/* If any of the outputs have previously failed, then don't
retry. */
/* If any of the outputs have previously failed, then don't bother
building again. */ - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1012
printMsg(lvlError, format("ERROR: %1%") % e.msg());abort(); // FIXMEprintMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%")% step->drvPath % machine->sshName % e.msg()); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1023
bool retry = false;if (result.status == RemoteResult::rrMiscFailure) {auto step_(step->state.lock());retry = step_->tries + 1 < maxTries;} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1035
{if (!retry) { - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1068
BuildStatus buildStatus =result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted;BuildStepStatus buildStepStatus =result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1075
if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = "";if (buildStatus != bsAborted) result.errorMsg = ""; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1077
if (!cachedFailure) {if (!cachedFailure && !retry) { - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1083
createBuildStep(txn, 0, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id);createBuildStep(txn, 0, build2, step, machine->sshName,buildStepStatus, result.errorMsg, build->id); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1087
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1088
if (!cachedFailure)finishBuildStep(txn, result.startTime, result.stopTime, build->id,stepNr, machine->sshName, buildStepStatus, result.errorMsg); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1094[6.17060]→[6.9094:9139](∅→∅),[6.9139]→[6.14279:14367](∅→∅),[6.9139]→[6.17101:17135](∅→∅),[6.14367]→[6.17101:17135](∅→∅),[6.17101]→[6.17101:17135](∅→∅),[6.17135]→[6.2286:2434](∅→∅),[6.694]→[6.17272:17393](∅→∅),[6.2434]→[6.17272:17393](∅→∅),[6.17272]→[6.17272:17393](∅→∅),[6.17393]→[6.14368:14407](∅→∅),[6.14407]→[6.2435:2525](∅→∅),[6.2525]→[6.17464:17548](∅→∅),[6.14453]→[6.17464:17548](∅→∅),[6.17464]→[6.17464:17548](∅→∅)
for (auto build2 : dependents) {printMsg(lvlError, format("marking build %1% as failed") % build2->id);txn.parameterized("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")(build2->id)((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed))(result.startTime)(result.stopTime)(cachedFailure ? 1 : 0).exec();build2->finishedInDB = true; // FIXME: txn might fail}if (!retry)for (auto build2 : dependents) {printMsg(lvlError, format("marking build %1% as failed") % build2->id);txn.parameterized("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")(build2->id)((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))(result.startTime)(result.stopTime)(cachedFailure ? 1 : 0).exec();build2->finishedInDB = true; // FIXME: txn might fail} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1121[6.9391]→[6.9391:9426](∅→∅),[6.9426]→[6.14454:14538](∅→∅),[6.14538]→[6.9478:9569](∅→∅),[6.9478]→[6.9478:9569](∅→∅)
for (auto build2 : dependents)if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {auto builds_(builds.lock());builds_->erase(build2->id);}if (!retry)for (auto build2 : dependents)if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {auto builds_(builds.lock());builds_->erase(build2->id);} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1131
destroyStep(step, result.status == RemoteResult::rrSuccess);if (!retry)destroyStep(step, result.status == RemoteResult::rrSuccess);return retry; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1202
printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups);printMsg(lvlError, format("%1% build step retries") % nrRetries);printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries);printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups);printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups);printMsg(lvlError, format("%1% database connections") % dbPool.count()); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1229
//printMsg(lvlInfo, "exiting...");//printMsg(lvlInfo, format("psql connections = %1%") % dbPool.count()); - replacement in src/root/build.tt at line 54
<span class="error">Aborted</span><span class="error"><strong>Aborted</strong>[% IF step.errormsg %]: [% HTML.escape(step.errormsg); END %]</span> - edit in src/sql/hydra.sql at line 375[23.1109][23.1109]
-- FIXME: remove