Aborted builds are now put back on the runnable queue and retried after a certain time interval (currently 60 seconds for the first retry, then tripled on each subsequent retry).
OCZ4LSGGSCMSLGC3C32D5JUYYHS5CIPOKOAMADEFAFZOFXJ3YY3AC
6TY4LNHHAUSZNFDN34CHUWMZSQN5I3LNOOSV457M44FDGZB76X4AC
MSIHMO45JO5V5ICZ7SKVWH6CLINKQY3UTA7274Q5OZCKPRX7SUNQC
GKZN4UV75GV7GEHQGBM2O6UHN6CVSHA7BMASCUQSDIDYVUKKZL7AC
RQUAATWBGEP3YT4F555XLJYRRRGHDTEILHFORES7AM2XAOVMVJSAC
5AIYUMTBY6TFQTBRP3MJ2PYWUMRF57I77NIVWYE74UMEVQMBWZVQC
NJJ7H64SZOX5EGACDCQAUQ7R6UEWD5IIC35A2MWFOOJV55DJYPHAC
24BMQDZAWDQ7VNIA7TIROXSOYLOJBNZ2E4264WHWNJAEN6ZB3UOAC
ENXUSMSVOU3AZFMH2ZXR4ZVPV2LRRQYQJ6IFX33YN6IH2ORSNSAAC
RYTQLATYOZ6ODIKYVJ63TC4OIQBXHSCV3NA2YD4NFP7443GQVSRQC
IWB3F4Z6QZYHQFJ6FWZTGWLCPBYEUTFLUS3F7QT7JOA4DV4YLYGAC
LJILHOJ7QYXNTMKDQS6OU4PVCXPQ27C5KMQWHDBELCYP7FG3ZFYAC
FQQRJUO4C7655SFAKPRPILALVEVRQSIIVBLMQUFPODUBXPIMWYSAC
T2EIYJNGPIANHKJ4HBJIPTINWKG7RDLHR3PVHFYAPPLHZAJQBVWAC
2GUAKGTBTNFFER343SQWSLFYIXXHJLDSGH5JHF7QMC3AVZB7Q3TQC
J5UVLXOK6EDIL5I7VKWH4V2QDS4DPD7FHRK6XBWSXFRQS4JKXFZQC
JGLE5BRNJTMCZFZ33G2SGOQ4F3CIZAPMU3IXCNMR5NNSG5EJLX6QC
4HPT4SDDNU24OX2P4FDP2KXKINDCJLZEBN2VIEEKWFEVT4TNWXZAC
N22GPKYTOLZLBGTGDATQDVZ4R5APZEAOIA7L32X4UXBH4XNI7MWAC
unsigned int magic = readInt(from);
if (magic != SERVE_MAGIC_2)
throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);
unsigned int version = readInt(from);
if (GET_PROTOCOL_MAJOR(version) != 0x200)
throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);
unsigned int magic = readInt(from);
if (magic != SERVE_MAGIC_2)
throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName);
unsigned int version = readInt(from);
if (GET_PROTOCOL_MAJOR(version) != 0x200)
throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName);
} catch (EndOfFile & e) {
child.pid.wait(true);
throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile)));
}
dispatcherWakeup.wait(lock);
printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
dispatcherWakeup.wait_until(lock, sleepUntil);
nrDispatcherWakeups++;
printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what());
// FIXME: put step back in runnable and retry
printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
% step->drvPath % reservation->machine->sshName % e.what());
/* If there was a temporary failure, retry the step after an
exponentially increasing interval. */
if (retry) {
{
auto step_(step->state.lock());
step_->tries++;
nrRetries++;
if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
}
makeRunnable(step);
}
Build). FIXME: what if a new Build gets a reference to
this step? */
Build. However, it's possible that a new Build just
created a reference to this step. So to handle that
possibility, we retry this step (putting it back in
the runnable queue). If there are really no strong
pointers to the step, it will be deleted. */
for (auto build2 : dependents) {
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
txn.parameterized
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
(build2->id)
((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed))
(result.startTime)
(result.stopTime)
(cachedFailure ? 1 : 0).exec();
build2->finishedInDB = true; // FIXME: txn might fail
}
if (!retry)
for (auto build2 : dependents) {
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
txn.parameterized
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
(build2->id)
((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
(result.startTime)
(result.stopTime)
(cachedFailure ? 1 : 0).exec();
build2->finishedInDB = true; // FIXME: txn might fail
}
for (auto build2 : dependents)
if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
auto builds_(builds.lock());
builds_->erase(build2->id);
}
if (!retry)
for (auto build2 : dependents)
if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
auto builds_(builds.lock());
builds_->erase(build2->id);
}
printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups);
printMsg(lvlError, format("%1% build step retries") % nrRetries);
printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries);
printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups);
printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups);
printMsg(lvlError, format("%1% database connections") % dbPool.count());