Kill active build steps when builds are cancelled
[?]
Oct 31, 2016, 1:58 PM
LVQXQIYA7QMLVYOANYEFHDBTFAOSE3D2IYAVOG2DXURTASRCUNYQC
Dependencies
- [2] TTBLPQAJ Keep track of wait time per system type
- [3] OTNJLJHA Sort build steps
- [4] UYUVQWXQ Fix hydra-queue-runner --build-one
- [5] KQ3EGUQY Add some instrumentation to keep track of dispatcher cost
- [6] UNVLTCV4 Fix showing machine name for aborted build steps
- [7] EHEQ4AY3 Fix retry of transient failures
- [8] OPN3PED2 Tweak
- [9] 3BKPZ52C Disambiguate "marking build as succeeded" message
- [10] YTAYNN7V Queue monitor: Bail out earlier if a step has failed previously
- [11] NTEDD7T4 Provide a plugin hook for when build steps finish
- [12] OBOTGFG6 Prevent orphaned build steps
- [13] BRAESISH Warn if PostgreSQL appears stalled
- [14] PH3DFCNU Render machine correctly if it doesn't contain @
- [15] TX7Q4RAS Add page showing latest build steps
- [16] 2GUAKGTB Fix indentation of build.tt
- [17] ZH6B56XR Try harder to find build logs
- [18] HJOEIMLR Refactor
- [19] MHVIT4JY Split hydra-queue-runner.cc more
- [20] MSIHMO45 Tweak build steps
- [21] EYR3EW6J Keep stats for the Hydra auto scaler
- [22] LE4VZIY5 More stats
- [23] 73YR46NJ hydra-queue-runner: Write directly to a binary cache
- [24] UNVMKJV5 Unify build and step status codes
- [25] R7MDDCB2 Some unnecessary job names
- [26] 7LWB2J2Z Periodically clear orphaned build steps
- [27] FCTX433O Add buildStarted plugin hook
- [28] O64P4XJS Keep per-machine stats
- [29] FQQRJUO4 Mark builds as busy
- [30] PMNWRTGJ Add multiple output support
- [31] VQISTKOP hydra-queue-runner: Use substitutes
- [32] TPNHTE5V Remove obsolete Builds columns and provide accurate "Running builds"
- [33] 7LFMSF4K Don't show "localhost" as machine for cached failed build steps
- [34] 5JB5DKQL Don't repeat links to build step logs
- [35] DKJFD6JN Process Nix API changes
- [36] BD3GRK4B * Get rid of "positive failures" and separate log phases.
- [37] 24BMQDZA Start of single-process hydra-queue-runner
- [38] PLOZBRTR Add command ‘hydra-queue-runner --status’ to show current status
- [39] HUUZFPPK Fix race between the queue monitor and the builder threads
- [40] FITVNQ2S Keep track of the time we spend copying to/from build machines
- [41] 5AIYUMTB Basic remote building
- [42] BG6PEOB2 Make the output size limit configurable
- [43] 62MQPRXC Pass null values to libpqxx properly
- [44] KBZHIGLG Record the machine used for a build step
- [45] LJILHOJ7 Create BuildSteps race-free
- [46] WE5Q2NVI Allow build to be bumped to the front of the queue via the web interface
- [47] NKQOEVVP Get rid of "will retry" messages after "maybe cancelling..."
- [48] DWFTK56E Keep track of how many threads are waiting
- [49] UQQ4IL55 Add a error type for "unsupported system type"
- [50] NQ2X3Y4K Don't render machine name if not applicable to step
- [*] J5UVLXOK * Start of a basic Catalyst web interface.
- [*] OCZ4LSGG Automatically retry aborted builds
- [*] JGLE5BRN Add separate build step status codes for cached failures and timeouts
- [*] N22GPKYT * Put info about logs / build products in the DB.
Change contents
- edit in src/hydra-queue-runner/builder.cc at line 15
reservation->threadId = pthread_self(); - replacement in src/hydra-queue-runner/builder.cc at line 18
MaintainCount mc(nrActiveSteps);activeSteps_.lock()->insert(reservation);Finally removeActiveStep([&]() {reservation->threadId = -1;activeSteps_.lock()->erase(reservation);}); - replacement in src/hydra-queue-runner/builder.cc at line 73
thousands of builds), so we don't. */Build::ptr build;thousands of builds), so we don't.We don't keep a Build::ptr here to allowState::processQueueChange() to detect whether a step can becancelled (namely if there are no more Builds referring toit). */BuildID buildId;Path buildDrvPath;unsigned int maxSilentTime, buildTimeout; - edit in src/hydra-queue-runner/builder.cc at line 102
Build::ptr build; - edit in src/hydra-queue-runner/builder.cc at line 112
buildId = build->id;buildDrvPath = build->drvPath;maxSilentTime = build->maxSilentTime;buildTimeout = build->buildTimeout; - replacement in src/hydra-queue-runner/builder.cc at line 118
% step->drvPath % machine->sshName % build->id % (dependents.size() - 1));% step->drvPath % machine->sshName % buildId % (dependents.size() - 1)); - replacement in src/hydra-queue-runner/builder.cc at line 121
bool quit = build->id == buildOne && step->drvPath == build->drvPath;bool quit = buildId == buildOne && step->drvPath == buildDrvPath; - replacement in src/hydra-queue-runner/builder.cc at line 132
printError("marking step %d of build %d as orphaned", stepNr, build->id);printError("marking step %d of build %d as orphaned", stepNr, buildId); - replacement in src/hydra-queue-runner/builder.cc at line 134
orphanedSteps_->emplace(build->id, stepNr);orphanedSteps_->emplace(buildId, stepNr); - replacement in src/hydra-queue-runner/builder.cc at line 151
stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bsBusy);stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->sshName, bsBusy); - replacement in src/hydra-queue-runner/builder.cc at line 158
buildRemote(destStore, machine, step, build->maxSilentTime, build->buildTimeout, result);buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, result); - edit in src/hydra-queue-runner/builder.cc at line 165
} catch (__cxxabiv1::__forced_unwind & e) {/* The queue monitor thread cancelled this step. */try {printInfo("marking step %d of build %d as succeeded", stepNr, buildId);pqxx::work txn(*conn);finishBuildStep(txn, result.startTime, time(0), result.overhead, buildId,stepNr, machine->sshName, bsCancelled, "");txn.commit();stepFinished = true;} catch (...) {ignoreException();}throw; - replacement in src/hydra-queue-runner/builder.cc at line 204
logCompressorQueue_->push({build->id, stepNr, result.logFile});logCompressorQueue_->push({buildId, stepNr, result.logFile}); - replacement in src/hydra-queue-runner/builder.cc at line 224
finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, build->id,finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, buildId, - replacement in src/hydra-queue-runner/builder.cc at line 280
build->id, stepNr, machine->sshName, bsSuccess);buildId, stepNr, machine->sshName, bsSuccess); - replacement in src/hydra-queue-runner/builder.cc at line 284
markSucceededBuild(txn, b, res, build != b || result.isCached,markSucceededBuild(txn, b, res, buildId != b->id || result.isCached, - replacement in src/hydra-queue-runner/builder.cc at line 377
(result.stepStatus != bsCachedFailure && build == build2) ||(result.stepStatus != bsCachedFailure && buildId == build2->id) || - replacement in src/hydra-queue-runner/builder.cc at line 380
createBuildStep(txn, 0, build2, step, machine->sshName,result.stepStatus, result.errorMsg, build == build2 ? 0 : build->id);createBuildStep(txn, 0, build2->id, step, machine->sshName,result.stepStatus, result.errorMsg, buildId == build2->id ? 0 : buildId); - replacement in src/hydra-queue-runner/builder.cc at line 387
build->id, stepNr, machine->sshName, result.stepStatus, result.errorMsg);buildId, stepNr, machine->sshName, result.stepStatus, result.errorMsg); - replacement in src/hydra-queue-runner/builder.cc at line 429
notificationSenderQueue_->push(NotificationItem{NotificationItem::Type::BuildFinished, build->id, dependentIDs});notificationSenderQueue_->push(NotificationItem{NotificationItem::Type::BuildFinished, buildId, dependentIDs}); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 227
unsigned int State::allocBuildStep(pqxx::work & txn, Build::ptr build)unsigned int State::allocBuildStep(pqxx::work & txn, BuildID buildId) - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 233
auto res = txn.parameterized("select max(stepnr) from BuildSteps where build = $1")(build->id).exec();auto res = txn.parameterized("select max(stepnr) from BuildSteps where build = $1")(buildId).exec(); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 238
unsigned int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step,unsigned int State::createBuildStep(pqxx::work & txn, time_t startTime, BuildID buildId, Step::ptr step, - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 241
unsigned int stepNr = allocBuildStep(txn, build);auto stepNr = allocBuildStep(txn, buildId); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 245
(build->id)(buildId) - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 261
(build->id)(stepNr)(output.first)(output.second.path).exec();(buildId)(stepNr)(output.first)(output.second.path).exec(); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 287
int stepNr = allocBuildStep(txn, build);auto stepNr = allocBuildStep(txn, build->id); - replacement in src/hydra-queue-runner/hydra-queue-runner.cc at line 577
root.attr("nrActiveSteps", nrActiveSteps);root.attr("nrActiveSteps", activeSteps_.lock()->size()); - edit in src/hydra-queue-runner/queue-monitor.cc at line 4
#include <cstring> - replacement in src/hydra-queue-runner/queue-monitor.cc at line 185
createBuildStep(txn, 0, build, ex.step, "", bsCachedFailure, "", propagatedFrom);createBuildStep(txn, 0, build->id, ex.step, "", bsCachedFailure, "", propagatedFrom); - replacement in src/hydra-queue-runner/queue-monitor.cc at line 317
auto builds_(builds.lock());{auto builds_(builds.lock()); - replacement in src/hydra-queue-runner/queue-monitor.cc at line 320[13.29045]→[13.29045:29106](∅→∅),[13.29106]→[13.1591:1672](∅→∅),[13.1672]→[13.29167:29360](∅→∅),[13.29167]→[13.29167:29360](∅→∅),[13.29360]→[13.1673:1695](∅→∅)
for (auto i = builds_->begin(); i != builds_->end(); ) {auto b = currentIds.find(i->first);if (b == currentIds.end()) {printMsg(lvlInfo, format("discarding cancelled build %1%") % i->first);i = builds_->erase(i);// FIXME: ideally we would interrupt active build steps here.continue;for (auto i = builds_->begin(); i != builds_->end(); ) {auto b = currentIds.find(i->first);if (b == currentIds.end()) {printMsg(lvlInfo, format("discarding cancelled build %1%") % i->first);i = builds_->erase(i);// FIXME: ideally we would interrupt active build steps here.continue;}if (i->second->globalPriority < b->second) {printMsg(lvlInfo, format("priority of build %1% increased") % i->first);i->second->globalPriority = b->second;i->second->propagatePriorities();}++i; - replacement in src/hydra-queue-runner/queue-monitor.cc at line 335
if (i->second->globalPriority < b->second) {printMsg(lvlInfo, format("priority of build %1% increased") % i->first);i->second->globalPriority = b->second;i->second->propagatePriorities();}{auto activeSteps(activeSteps_.lock());for (auto & activeStep : *activeSteps) {auto threadId = activeStep->threadId; // FIXME: use Sync or atomic?if (threadId == 0) continue;std::set<Build::ptr> dependents;std::set<Step::ptr> steps;getDependents(activeStep->step, dependents, steps);if (!dependents.empty()) continue;printInfo("cancelling thread for build step ‘%s’", activeStep->step->drvPath);int err = pthread_cancel(threadId);if (err)printError("error cancelling thread for build step ‘%s’: %s",activeStep->step->drvPath, strerror(err)); - edit in src/hydra-queue-runner/queue-monitor.cc at line 355
++i; - edit in src/hydra-queue-runner/state.hh at line 31
bsCancelled = 4, - edit in src/hydra-queue-runner/state.hh at line 300
counter nrActiveSteps{0}; - edit in src/hydra-queue-runner/state.hh at line 373
pthread_t threadId = 0;bool cancelled = false; - edit in src/hydra-queue-runner/state.hh at line 378
nix::Sync<std::set<std::shared_ptr<MachineReservation>>> activeSteps_; - replacement in src/hydra-queue-runner/state.hh at line 420
unsigned int allocBuildStep(pqxx::work & txn, Build::ptr build);unsigned int allocBuildStep(pqxx::work & txn, BuildID buildId); - replacement in src/hydra-queue-runner/state.hh at line 422
unsigned int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step,unsigned int createBuildStep(pqxx::work & txn, time_t startTime, BuildID buildId, Step::ptr step, - replacement in src/root/build.tt at line 26[13.53]→[13.11819:11941](∅→∅),[3.85]→[13.11819:11941](∅→∅),[13.2133]→[13.11819:11941](∅→∅),[13.11819]→[13.11819:11941](∅→∅)
[% IF ( type == "All" ) || ( type == "Failed" && step.status != 0 ) || ( type == "Running" && step.busy == 1 ) %][% IF ( type == "All" ) || ( type == "Failed" && step.busy == 0 && step.status != 0 ) || ( type == "Running" && step.busy == 1 ) %] - replacement in src/root/build.tt at line 52
<td>[% IF step.busy == 1 || ((step.machine || step.starttime) && (step.status == 0 || step.status == 1 || step.status == 3 || step.status == 7)); INCLUDE renderMachineName machine=step.machine; ELSE; "<em>n/a</em>"; END %]</td><td>[% IF step.busy == 1 || ((step.machine || step.starttime) && (step.status == 0 || step.status == 1 || step.status == 3 || step.status == 4 || step.status == 7)); INCLUDE renderMachineName machine=step.machine; ELSE; "<em>n/a</em>"; END %]</td> - edit in src/root/build.tt at line 60
[% ELSIF step.status == 4 %]<span class="error">Cancelled</span> - replacement in src/root/build.tt at line 247
[% IF steps && build.buildstatus != 0 && build.buildstatus != 6 %][% IF steps && build.buildstatus != 0 && build.buildstatus != 4 && build.buildstatus != 6 %] - replacement in src/sql/hydra.sql at line 206
-- 4 = build cancelled (removed from queue; never built) [builds only]-- 4 = build or step cancelled