Abort unsupported build steps
[?]
Mar 26, 2020, 2:00 PM
NJXD2ABJHKH7MBAPEOXQRKC3RK2OVNRO4ND44SOZJECZGCGZK7KQC
Dependencies
- [2] 7KLHBDYA Fix build
- [3] 32KJOERM Turn hydra-notify into a daemon
- [4] WV4SSAIY Build against nix-master
- [5] FITVNQ2S Keep track of the time we spend copying to/from build machines
- [6] UVQJBDHN Move log compression to a plugin
- [7] B7ENVLRS hydra-queue-runner: Make build notification more reliable
- [8] BG6PEOB2 Make the output size limit configurable
- [9] TPNHTE5V Remove obsolete Builds columns and provide accurate "Running builds"
- [10] N4IROACV Move buildRemote() into State
- [11] NAYQT2GT hydra-queue-runner: Use cmdBuildDerivation
- [12] LVQXQIYA Kill active build steps when builds are cancelled
- [13] DKJFD6JN Process Nix API changes
- [14] 7LWB2J2Z Periodically clear orphaned build steps
- [15] BRAESISH Warn if PostgreSQL appears stalled
- [16] MJL3PDXI Fix duplicate step_finished notification
- [17] HH3LID6L Re-implement log size limits
- [18] UNVMKJV5 Unify build and step status codes
- [19] UVNTWTWG Prevent download of NARs we just uploaded
- [20] NKQOEVVP Get rid of "will retry" messages after "maybe cancelling..."
- [21] NTEDD7T4 Provide a plugin hook for when build steps finish
- [22] MHVIT4JY Split hydra-queue-runner.cc more
- [23] OBOTGFG6 Prevent orphaned build steps
- [*] 4I2HF4L3 Unindent
- [*] IK2UBDAU Revive jobset scheduling
- [*] 24BMQDZA Start of single-process hydra-queue-runner
- [*] NWFDDRUG hydra-queue-runner: Limit concurrent database connections
- [*] HJOEIMLR Refactor
- [*] TTBLPQAJ Keep track of wait time per system type
- [*] OG3Z3QGC Namespace cleanup
- [*] EYR3EW6J Keep stats for the Hydra auto scaler
Change contents
- replacement in src/hydra-queue-runner/builder.cc at line 379
} else {} elsefailStep(*conn, step, buildId, result, machine, stepFinished, quit); - replacement in src/hydra-queue-runner/builder.cc at line 382
/* Register failure in the database for all Build objects thatdirectly or indirectly depend on this step. */// FIXME: keep stats about aborted steps?nrStepsDone++;totalStepTime += stepStopTime - stepStartTime;totalStepBuildTime += result.stopTime - result.startTime;machine->state->nrStepsDone++;machine->state->totalStepTime += stepStopTime - stepStartTime;machine->state->totalStepBuildTime += result.stopTime - result.startTime; - replacement in src/hydra-queue-runner/builder.cc at line 390
std::vector<BuildID> dependentIDs;if (quit) exit(0); // testing hack; FIXME: this won't run plugins - replacement in src/hydra-queue-runner/builder.cc at line 392
while (true) {/* Get the builds and steps that depend on this step. */std::set<Build::ptr> indirect;{auto steps_(steps.lock());std::set<Step::ptr> steps;getDependents(step, indirect, steps);return sDone;} - edit in src/hydra-queue-runner/builder.cc at line 395[5.9781]→[5.9781:10074](∅→∅),[5.10074]→[4.5050:5195](∅→∅),[4.5195]→[5.10173:10246](∅→∅),[5.10173]→[5.10173:10246](∅→∅),[5.10273]→[5.10273:10305](∅→∅)
/* If there are no builds left, delete all referringsteps from ‘steps’. As for the success case, we canbe certain no new referrers can be added. */if (indirect.empty()) {for (auto & s : steps) {printMsg(lvlDebug, "finishing build step ‘%s’",localStore->printStorePath(s->drvPath));steps_->erase(s->drvPath);}}} - replacement in src/hydra-queue-runner/builder.cc at line 396
if (indirect.empty() && stepFinished) break;void State::failStep(Connection & conn,Step::ptr step,BuildID buildId,const RemoteResult & result,Machine::ptr machine,bool & stepFinished,bool & quit){/* Register failure in the database for all Build objects thatdirectly or indirectly depend on this step. */ - replacement in src/hydra-queue-runner/builder.cc at line 408
/* Update the database. */{auto mc = startDbUpdate();std::vector<BuildID> dependentIDs; - replacement in src/hydra-queue-runner/builder.cc at line 410
pqxx::work txn(*conn);while (true) {/* Get the builds and steps that depend on this step. */std::set<Build::ptr> indirect;{auto steps_(steps.lock());std::set<Step::ptr> steps;getDependents(step, indirect, steps); - replacement in src/hydra-queue-runner/builder.cc at line 418[5.11220]→[5.2802:3078](∅→∅),[5.3078]→[5.11545:11594](∅→∅),[5.11545]→[5.11545:11594](∅→∅),[5.11594]→[5.3079:3181](∅→∅),[5.3181]→[5.2138:2229](∅→∅),[5.2229]→[5.11736:11816](∅→∅),[5.3266]→[5.11736:11816](∅→∅),[5.11736]→[5.11736:11816](∅→∅),[5.11816]→[5.2230:2408](∅→∅)
/* Create failed build steps for every build thatdepends on this, except when this step is cachedand is the top-level of that build (since then it'sredundant with the build's isCachedBuild field). */for (auto & build2 : indirect) {if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) ||(result.stepStatus != bsCachedFailure && buildId == build2->id) ||build2->finishedInDB)continue;createBuildStep(txn, 0, build2->id, step, machine->sshName,result.stepStatus, result.errorMsg, buildId == build2->id ? 0 : buildId);/* If there are no builds left, delete all referringsteps from ‘steps’. As for the success case, we canbe certain no new referrers can be added. */if (indirect.empty()) {for (auto & s : steps) {printMsg(lvlDebug, "finishing build step ‘%s’",localStore->printStorePath(s->drvPath));steps_->erase(s->drvPath); - edit in src/hydra-queue-runner/builder.cc at line 427
}} - replacement in src/hydra-queue-runner/builder.cc at line 430[5.12212]→[5.12212:12527](∅→∅),[5.12527]→[5.0:190](∅→∅),[5.184]→[5.12696:12733](∅→∅),[5.190]→[5.12696:12733](∅→∅),[5.12696]→[5.12696:12733](∅→∅),[5.12733]→[5.3520:3660](∅→∅),[5.3660]→[5.12855:12940](∅→∅),[5.12855]→[5.12855:12940](∅→∅),[5.12940]→[5.3661:3740](∅→∅),[5.3740]→[5.12996:13050](∅→∅),[5.12996]→[5.12996:13050](∅→∅)
/* Mark all builds that depend on this derivation as failed. */for (auto & build2 : indirect) {if (build2->finishedInDB) continue;printMsg(lvlError, format("marking build %1% as failed") % build2->id);txn.parameterized("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")(build2->id)((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))(result.startTime)(result.stopTime)(result.stepStatus == bsCachedFailure ? 1 : 0).exec();nrBuildsDone++;}if (indirect.empty() && stepFinished) break; - replacement in src/hydra-queue-runner/builder.cc at line 432
/* Remember failed paths in the database so that theywon't be built again. */if (result.stepStatus != bsCachedFailure && result.canCache)for (auto & path : step->drv->outputPaths())txn.parameterized("insert into FailedPaths values ($1)")(localStore->printStorePath(path)).exec();/* Update the database. */{auto mc = startDbUpdate(); - replacement in src/hydra-queue-runner/builder.cc at line 436
txn.commit();}pqxx::work txn(conn); - replacement in src/hydra-queue-runner/builder.cc at line 438
stepFinished = true;/* Create failed build steps for every build thatdepends on this, except when this step is cachedand is the top-level of that build (since then it'sredundant with the build's isCachedBuild field). */for (auto & build : indirect) {if ((result.stepStatus == bsCachedFailure && build->drvPath == step->drvPath) ||((result.stepStatus != bsCachedFailure && result.stepStatus != bsUnsupported) && buildId == build->id) ||build->finishedInDB)continue;createBuildStep(txn,0, build->id, step, machine ? machine->sshName : "",result.stepStatus, result.errorMsg, buildId == build->id ? 0 : buildId);} - replacement in src/hydra-queue-runner/builder.cc at line 452
/* Remove the indirect dependencies from ‘builds’. Thiswill cause them to be destroyed. */for (auto & b : indirect) {auto builds_(builds.lock());b->finishedInDB = true;builds_->erase(b->id);dependentIDs.push_back(b->id);if (buildOne == b->id) quit = true;/* Mark all builds that depend on this derivation as failed. */for (auto & build : indirect) {if (build->finishedInDB) continue;printMsg(lvlError, format("marking build %1% as failed") % build->id);txn.parameterized("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")(build->id)((int) (build->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))(result.startTime)(result.stopTime)(result.stepStatus == bsCachedFailure ? 1 : 0).exec();nrBuildsDone++; - edit in src/hydra-queue-runner/builder.cc at line 465
} - replacement in src/hydra-queue-runner/builder.cc at line 466
/* Send notification about this build and its dependents. */{pqxx::work txn(*conn);notifyBuildFinished(txn, buildId, dependentIDs);/* Remember failed paths in the database so that theywon't be built again. */if (result.stepStatus != bsCachedFailure && result.canCache)for (auto & path : step->drv->outputPaths())txn.parameterized("insert into FailedPaths values ($1)")(localStore->printStorePath(path)).exec(); - edit in src/hydra-queue-runner/builder.cc at line 474
} - replacement in src/hydra-queue-runner/builder.cc at line 475
// FIXME: keep stats about aborted steps?nrStepsDone++;totalStepTime += stepStopTime - stepStartTime;totalStepBuildTime += result.stopTime - result.startTime;machine->state->nrStepsDone++;machine->state->totalStepTime += stepStopTime - stepStartTime;machine->state->totalStepBuildTime += result.stopTime - result.startTime;stepFinished = true; - replacement in src/hydra-queue-runner/builder.cc at line 477
if (quit) exit(0); // testing hack; FIXME: this won't run plugins/* Remove the indirect dependencies from ‘builds’. Thiswill cause them to be destroyed. */for (auto & b : indirect) {auto builds_(builds.lock());b->finishedInDB = true;builds_->erase(b->id);dependentIDs.push_back(b->id);if (buildOne == b->id) quit = true;}} - replacement in src/hydra-queue-runner/builder.cc at line 488
return sDone;/* Send notification about this build and its dependents. */{pqxx::work txn(conn);notifyBuildFinished(txn, buildId, dependentIDs);txn.commit();} - edit in src/hydra-queue-runner/dispatcher.cc at line 302
abortUnsupported(); - edit in src/hydra-queue-runner/dispatcher.cc at line 317
void State::abortUnsupported(){/* Make a copy of 'runnable' and 'machines' so we don't block themvery long. */auto runnable2 = *runnable.lock();auto machines2 = *machines.lock();system_time now = std::chrono::system_clock::now();auto now2 = time(0); - edit in src/hydra-queue-runner/dispatcher.cc at line 329
std::unordered_set<Step::ptr> aborted; - edit in src/hydra-queue-runner/dispatcher.cc at line 331
for (auto & wstep : runnable2) {auto step(wstep.lock());if (!step) continue;bool supported = false;for (auto & machine : machines2) {if (machine.second->supportsStep(step)) {step->state.lock()->lastSupported = now;supported = true;break;}}if (!supported&& std::chrono::duration_cast<std::chrono::seconds>(now - step->state.lock()->lastSupported).count() >= maxUnsupportedTime){printError("aborting unsupported build step '%s' (type '%s')",localStore->printStorePath(step->drvPath),step->systemType);aborted.insert(step);auto conn(dbPool.get());std::set<Build::ptr> dependents;std::set<Step::ptr> steps;getDependents(step, dependents, steps);/* Maybe the step got cancelled. */if (dependents.empty()) continue;/* Find the build that has this step as the top-level (ifany). */Build::ptr build;for (auto build2 : dependents) {if (build2->drvPath == step->drvPath)build = build2;}if (!build) build = *dependents.begin();bool stepFinished = false;bool quit = false;failStep(*conn, step, build->id,RemoteResult {.stepStatus = bsUnsupported,.errorMsg = fmt("unsupported system type '%s'",step->systemType),.startTime = now2,.stopTime = now2,},nullptr, stepFinished, quit);if (quit) exit(1);}}/* Clean up 'runnable'. */{auto runnable_(runnable.lock());for (auto i = runnable_->begin(); i != runnable_->end(); ) {if (aborted.count(i->lock()))i = runnable_->erase(i);else++i;}}} - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 49
, maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0)) - replacement in src/hydra-queue-runner/state.hh at line 71
BuildStatus buildStatus()BuildStatus buildStatus() const - edit in src/hydra-queue-runner/state.hh at line 201
/* The time that we last saw a machine that supports thisstep. */system_time lastSupported = std::chrono::system_clock::now(); - edit in src/hydra-queue-runner/state.hh at line 310
/* Time in seconds before unsupported build steps are aborted. */const unsigned int maxUnsupportedTime = 0; - edit in src/hydra-queue-runner/state.hh at line 492
void failStep(Connection & conn,Step::ptr step,BuildID buildId,const RemoteResult & result,Machine::ptr machine,bool & stepFinished,bool & quit); - edit in src/hydra-queue-runner/state.hh at line 516[29.8187][32.3028]
void abortUnsupported();