Fix race between the queue monitor and the builder threads
[?]
Jun 18, 2015, 2:30 PM
HUUZFPPKGHTXFZMZCO2UGWYNGEED3E2CFHQRFQVVBJGPQVGVY4UACDependencies
- [2]
HLSHCK3CSupport requiredSystemFeatures - [3]
NNOCZ4ROhydra-queue-runner: Improve dispatcher - [4]
UPN2SAMLAcquire exclusive table lock earlier - [5]
TM6WKSP3hydra-queue-runner: Set isCachedBuild - [6]
QJRDO2B4Simplify retry handling - [7]
RYTQLATYKeep track of failed paths in the Hydra database - [8]
22LDPAIPCheck non-runnable steps for unsupported system type - [9]
NJJ7H64SVery basic multi-threaded queue runner - [10]
YZAI5GQUImplement a database connection pool - [11]
2IQRXLWESupport cancelling builds - [12]
HHOMBU7Ghydra-queue-runner: Implement timeouts - [13]
5AIYUMTBBasic remote building - [14]
KBZHIGLGRecord the machine used for a build step - [15]
62MQPRXCPass null values to libpqxx properly - [16]
OCZ4LSGGAutomatically retry aborted builds - [17]
PQFOMNTLhydra-queue-runner: More stats - [18]
IWB3F4Z6Fail builds with previously failed steps early - [19]
JAUB2FT5getQueuedBuilds(): Handle dependent builds first - [20]
LJILHOJ7Create BuildSteps race-free - [21]
ENXUSMSVMake concurrency more robust - [22]
GKZN4UV7Make the queue monitor more robust, and better debug output - [23]
24BMQDZAStart of single-process hydra-queue-runner - [*]
N5O7VEEOImmediately abort builds that require an unsupported system type - [*]
UQQ4IL55Add a error type for "unsupported system type"
Change contents
- edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 122
bool finishedInDB; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 123
Build() : finishedInDB(false) { }std::atomic_bool finishedInDB{false}; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 159
Sync<State> state;std::atomic_bool created{false}; // debuggingstd::atomic_bool finished{false}; // debugging - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 162
std::atomic_bool destroyed;Sync<State> state; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 164[7.5886]→[7.5886:5920](∅→∅),[7.657]→[7.5993:5994](∅→∅),[7.5920]→[7.5993:5994](∅→∅),[7.5993]→[7.5993:5994](∅→∅),[7.5994]→[7.1434:1450](∅→∅)
Step() : destroyed(false) { }~Step() { }~Step(){printMsg(lvlError, format("destroying step %1%") % drvPath);} - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 283
Build::ptr referringBuild, Step::ptr referringStep, - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 286
void destroyStep(Step::ptr step, bool proceed);/* Get the builds that depend on the given step. */std::set<Build::ptr> getDependentBuilds(Step::ptr step); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 524
assert(!build->finishedInDB); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 538
Step::ptr step = createStep(store, build->drvPath, newSteps, newRunnable);Step::ptr step = createStep(store, build->drvPath, build, 0, newSteps, newRunnable); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 560[7.9878]→[7.971:1064](∅→∅),[7.1064]→[7.11213:11214](∅→∅),[7.2064]→[7.11213:11214](∅→∅),[7.9949]→[7.11213:11214](∅→∅),[7.11213]→[7.11213:11214](∅→∅)
printMsg(lvlInfo, format("marking build %1% as cached successful") % build->id); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 565
build->finishedInDB = true; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 603[4.72][26.310]
assert(!build->finishedInDB); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 625
auto step_(step->state.lock()); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 626
step_->builds.push_back(build); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 631[7.442]→[7.10110:10111](∅→∅),[7.1125]→[7.10110:10111](∅→∅),[7.10110]→[7.10110:10111](∅→∅),[7.10111]→[7.3558:3806](∅→∅),[7.3558]→[7.3558:3806](∅→∅)
/* Prior to this, the build is not visible togetDependentBuilds(). Now it is, so the build can befailed if a dependency fails. (It can't succeed right awaybecause its top-level is not runnable yet). */ - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 680
Build::ptr referringBuild, Step::ptr referringStep, - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 683
/* Check if the requested step already exists. *//* Check if the requested step already exists. If not, create anew step. In any case, make the step reachable fromreferringBuild or referringStep. This is done atomically (with‘steps’ locked), to ensure that this step can never becomereachable from a new build after doBuildStep has removed itfrom ‘steps’. */Step::ptr step;bool isNew = false; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 693
/* See if the step already exists in ‘steps’ and is notstale. */ - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 698
auto step = prev->second.lock();step = prev->second.lock(); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 701
if (step) return step;steps_->erase(drvPath); // remove stale entryif (!step) steps_->erase(drvPath); // remove stale entry}/* If it doesn't exist, create it. */if (!step) {step = std::make_shared<Step>();step->drvPath = drvPath;isNew = true; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 710
auto step_(step->state.lock());if (referringBuild)step_->builds.push_back(referringBuild);if (referringStep)step_->rdeps.push_back(referringStep);(*steps_)[drvPath] = step; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 724
auto step = std::make_shared<Step>();step->drvPath = drvPath;if (!isNew) {assert(step->created);return step;}/* Initialize the step. Note that the step may be visible in‘steps’ before this point, but that doesn't matter becauseit's not runnable yet, and other threads won't make itrunnable while step->created == false. */ - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 739
newSteps.insert(step); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 754
newSteps.insert(step); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 757
bool hasDeps = false; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 758
Step::ptr dep = createStep(store, i.first, newSteps, newRunnable);auto dep = createStep(store, i.first, 0, step, newSteps, newRunnable); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 760
hasDeps = true; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 761
auto dep_(dep->state.lock()); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 762
dep_->rdeps.push_back(step); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 763[7.12479]→[7.12479:12486](∅→∅),[7.12486]→[7.4889:5027](∅→∅),[7.5027]→[7.12513:12514](∅→∅),[7.12513]→[7.12513:12514](∅→∅),[7.12514]→[7.5028:5072](∅→∅),[7.2250]→[7.12565:12643](∅→∅),[7.5072]→[7.12565:12643](∅→∅),[7.12565]→[7.12565:12643](∅→∅),[7.12643]→[7.10112:10174](∅→∅),[7.10174]→[7.1431:1514](∅→∅),[7.1514]→[7.5155:5156](∅→∅),[7.5155]→[7.5155:5156](∅→∅),[7.5156]→[7.352:372](∅→∅),[7.372]→[7.5156:5235](∅→∅),[7.5156]→[7.5156:5235](∅→∅)
}{auto steps_(steps.lock());assert(steps_->find(drvPath) == steps_->end());(*steps_)[drvPath] = step;}if (!hasDeps) newRunnable.insert(step);return step;}void State::destroyStep(Step::ptr step, bool proceed){if (step->destroyed) return;step->destroyed = true;printMsg(lvlDebug, format("destroying build step ‘%1%’") % step->drvPath);nrStepsDone++;{auto steps_(steps.lock());steps_->erase(step->drvPath); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 765
std::vector<Step::wptr> rdeps;/* If the step has no (remaining) dependencies, make itrunnable. */ - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 769[7.5324]→[7.5324:5354](∅→∅),[7.5354]→[7.12675:12676](∅→∅),[7.12675]→[7.12675:12676](∅→∅),[7.12676]→[7.5355:5608](∅→∅)
rdeps = step_->rdeps;/* Sanity checks. */for (auto & build_ : step_->builds) {auto build = build_.lock();if (!build) continue;assert(build->drvPath == step->drvPath);assert(build->finishedInDB);}assert(!step->created);step->created = true;if (step_->deps.empty())newRunnable.insert(step); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 775[7.5615]→[7.5615:5648](∅→∅),[7.5648]→[7.12715:12778](∅→∅),[7.12715]→[7.12715:12778](∅→∅),[7.12778]→[7.5649:5879](∅→∅),[7.5879]→[7.12869:12991](∅→∅),[7.12869]→[7.12869:12991](∅→∅),[7.12991]→[7.5880:5906](∅→∅),[7.5906]→[7.2251:2287](∅→∅),[7.13027]→[7.2251:2287](∅→∅),[7.2287]→[7.13066:13081](∅→∅),[7.13066]→[7.13066:13081](∅→∅),[7.13081]→[7.5907:6020](∅→∅),[7.6020]→[7.13177:13221](∅→∅),[7.13177]→[7.13177:13221](∅→∅)
for (auto & rdep_ : rdeps) {auto rdep = rdep_.lock();if (!rdep) continue;bool runnable = false;{auto rdep_(rdep->state.lock());assert(has(rdep_->deps, step));rdep_->deps.erase(step);if (rdep_->deps.empty()) runnable = true;}if (proceed) {/* If this rdep has no other dependencies, then we can nowbuild it. */if (runnable)makeRunnable(rdep);} else/* If ‘step’ failed or was cancelled, then delete alldependent steps as well. */destroyStep(rdep, false);}return step; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 779
std::set<Build::ptr> State::getDependentBuilds(Step::ptr step)/* Get the steps and unfinished builds that depend on the given step. */void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps) - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 782
std::set<Step::ptr> done;std::set<Build::ptr> res; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 785
if (has(done, step)) return;done.insert(step);if (has(steps, step)) return;steps.insert(step); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 795
if (build_) res.insert(build_);if (build_ && !build_->finishedInDB) builds.insert(build_); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 810
return res; - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 819
assert(step->created);assert(!step->finished); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 884
printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());//printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 995
{auto step_(step->state.lock());assert(step->created);assert(!step->finished);} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1011
auto dependents = getDependentBuilds(step);std::set<Build::ptr> dependents;std::set<Step::ptr> steps;getDependents(step, dependents, steps); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1096[7.1530]→[7.8142:8563](∅→∅),[7.4555]→[7.8142:8563](∅→∅),[7.16048]→[7.8142:8563](∅→∅),[7.8563]→[6.510:575](∅→∅)
/* Remove this step. After this, incoming builds that depend ondrvPath will either see that the output paths exist, or willcreate a new build step for drvPath. The latter is fine - itwon't conflict with this one, because we're removing it. In anycase, the set of dependent builds for ‘step’ can't increaseanymore because ‘step’ is no longer visible to createStep(). */auto steps_(steps.lock());steps_->erase(step->drvPath);if (result.status == RemoteResult::rrSuccess) {/* Register success in the database for all Build objects thathave this step as the top-level step. Since the queuemonitor thread may be creating new referring Buildsconcurrently, and updating the database may fail, we dothis in a loop, marking all known builds, repeating untilthere are no unmarked builds.*/while (true) {/* Get the builds that have this one as the top-level. */std::vector<Build::ptr> direct;{auto steps_(steps.lock());auto step_(step->state.lock());for (auto & b_ : step_->builds) {auto b = b_.lock();if (b && !b->finishedInDB) direct.push_back(b);}/* If there are no builds left to update in the DB,then we're done. Delete the step from‘steps’. Since we've been holding the ‘steps’ lock,no new referrers can have been added in themeantime or be added afterwards. */if (direct.empty()) {printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);nrStepsDone++;steps_->erase(step->drvPath);break;}}/* Update the database. */{pqxx::work txn(*conn);finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);for (auto & b : direct)markSucceededBuild(txn, b, res, build != b,result.startTime, result.stopTime); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1141
/* Get the final set of dependent builds. */auto dependents = getDependentBuilds(step);txn.commit();} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1144[7.16116]→[7.8747:8780](∅→∅),[7.8780]→[7.16116:16122](∅→∅),[7.16116]→[7.16116:16122](∅→∅),[7.16122]→[7.8781:8953](∅→∅)
std::set<Build::ptr> direct;{auto step_(step->state.lock());for (auto & build : step_->builds) {auto build_ = build.lock();if (build_) direct.insert(build_);/* Remove the direct dependencies from ‘builds’. This willcause them to be destroyed. */for (auto & b : direct) {auto builds_(builds.lock());b->finishedInDB = true;builds_->erase(b->id);} - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1152
} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1153
/* Update the database. */{pqxx::work txn(*conn);/* Wake up any dependent steps that have no otherdependencies. */{auto step_(step->state.lock());for (auto & rdepWeak : step_->rdeps) {auto rdep = rdepWeak.lock();if (!rdep) continue; - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1161
if (result.status == RemoteResult::rrSuccess) {bool runnable = false;{auto rdep_(rdep->state.lock());rdep_->deps.erase(step);if (rdep_->deps.empty()) runnable = true;} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1168
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);if (runnable) makeRunnable(rdep);}} - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1172
/* Mark all builds of which this derivation is the toplevel as succeeded. */for (auto build2 : direct)markSucceededBuild(txn, build2, res, build != build2,result.startTime, result.stopTime);} else { - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1174
} else {/* Failure case. *//* Register failure in the database for all Build objects thatdirectly or indirectly depend on this step. */ - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1177[7.200]→[7.4575:4613](∅→∅),[7.4613]→[7.1238:1417](∅→∅),[7.1417]→[7.4703:4749](∅→∅),[7.4703]→[7.4703:4749](∅→∅),[7.4749]→[7.1418:1600](∅→∅)
BuildStatus buildStatus =result.status == RemoteResult::rrPermanentFailure ? bsFailed :result.status == RemoteResult::rrTimedOut ? bsTimedOut :bsAborted;BuildStepStatus buildStepStatus =result.status == RemoteResult::rrPermanentFailure ? bssFailed :result.status == RemoteResult::rrTimedOut ? bssTimedOut :bssAborted;while (true) { - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1179
/* For regular failures, we don't care about the errormessage. */if (buildStatus != bsAborted) result.errorMsg = "";/* Get the builds and steps that depend on this step. */std::set<Build::ptr> indirect;{auto steps_(steps.lock());std::set<Step::ptr> steps;getDependents(step, indirect, steps); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1186
/* Create failed build steps for every build that dependson this. For cached failures, only create a step forbuilds that don't have this step as top-level(otherwise the user won't be able to see what causedthe build to fail). */for (auto build2 : dependents) {if (build == build2) continue;if (cachedFailure && build2->drvPath == step->drvPath) continue;createBuildStep(txn, 0, build2, step, machine->sshName,buildStepStatus, result.errorMsg, build->id);/* If there are no builds left, delete all referringsteps from ‘steps’. As for the success case, we canbe certain no new referrers can be added. */if (indirect.empty()) {for (auto & s : steps) {printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);nrStepsDone++;steps_->erase(s->drvPath);}break;} - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1198
/* Update the database. */{pqxx::work txn(*conn); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1203
if (!cachedFailure)finishBuildStep(txn, result.startTime, result.stopTime, build->id,stepNr, machine->sshName, buildStepStatus, result.errorMsg);BuildStatus buildStatus =result.status == RemoteResult::rrPermanentFailure ? bsFailed :result.status == RemoteResult::rrTimedOut ? bsTimedOut :bsAborted;BuildStepStatus buildStepStatus =result.status == RemoteResult::rrPermanentFailure ? bssFailed :result.status == RemoteResult::rrTimedOut ? bssTimedOut :bssAborted;/* For regular failures, we don't care about the errormessage. */if (buildStatus != bsAborted) result.errorMsg = "";/* Create failed build steps for every build that dependson this. For cached failures, only create a step forbuilds that don't have this step as top-level(otherwise the user won't be able to see what causedthe build to fail). */for (auto & build2 : indirect) {if (build == build2) continue;if (cachedFailure && build2->drvPath == step->drvPath) continue;createBuildStep(txn, 0, build2, step, machine->sshName,buildStepStatus, result.errorMsg, build->id);}if (!cachedFailure)finishBuildStep(txn, result.startTime, result.stopTime, build->id,stepNr, machine->sshName, buildStepStatus, result.errorMsg);/* Mark all builds that depend on this derivation as failed. */for (auto & build2 : indirect) {printMsg(lvlError, format("marking build %1% as failed") % build2->id);assert(!build->finishedInDB);txn.parameterized("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")(build2->id)((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))(result.startTime)(result.stopTime)(cachedFailure ? 1 : 0).exec();nrBuildsDone++;}/* Remember failed paths in the database so that theywon't be built again. */if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure)for (auto & path : outputPaths(step->drv))txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1252
/* Mark all builds that depend on this derivation as failed. */for (auto build2 : dependents) {printMsg(lvlError, format("marking build %1% as failed") % build2->id);txn.parameterized("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")(build2->id)((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))(result.startTime)(result.stopTime)(cachedFailure ? 1 : 0).exec();build2->finishedInDB = true; // FIXME: txn might failnrBuildsDone++;txn.commit(); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 1255
/* Remember failed paths in the database so that theywon't be built again. */if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure)for (auto & path : outputPaths(step->drv))txn.parameterized("insert into FailedPaths values ($1)")(path).exec();/* Remove the indirect dependencies from ‘builds’. Thiswill cause them to be destroyed. */for (auto & b : indirect) {auto builds_(builds.lock());b->finishedInDB = true;builds_->erase(b->id);} - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1264
txn.commit(); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1265
/* In case of success, destroy all Build objects of which ‘step’is the top-level derivation. In case of failure, destroy alldependent Build objects. Any Steps not referenced by otherBuilds will be destroyed as well. */for (auto build2 : dependents)if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {auto builds_(builds.lock());builds_->erase(build2->id);} - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1266[7.9570]→[7.9570:9718](∅→∅),[7.9718]→[6.2116:2181](∅→∅),[6.2181]→[7.6378:6379](∅→∅),[7.6378]→[7.6378:6379](∅→∅)
/* Remove the step from the graph. In case of success, makedependent build steps runnable if they have no otherdependencies. */destroyStep(step, result.status == RemoteResult::rrSuccess); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1274
assert(!build->finishedInDB); - edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 1304
build->finishedInDB = true; // FIXME: txn might fail