Abort unsupported build steps

[?]
Mar 26, 2020, 2:00 PM
NJXD2ABJHKH7MBAPEOXQRKC3RK2OVNRO4ND44SOZJECZGCGZK7KQC

Dependencies

  • [2] 7KLHBDYA Fix build
  • [3] 32KJOERM Turn hydra-notify into a daemon
  • [4] WV4SSAIY Build against nix-master
  • [5] OBOTGFG6 Prevent orphaned build steps
  • [6] B7ENVLRS hydra-queue-runner: Make build notification more reliable
  • [7] NAYQT2GT hydra-queue-runner: Use cmdBuildDerivation
  • [8] TPNHTE5V Remove obsolete Builds columns and provide accurate "Running builds"
  • [9] HH3LID6L Re-implement log size limits
  • [10] 7LWB2J2Z Periodically clear orphaned build steps
  • [11] NKQOEVVP Get rid of "will retry" messages after "maybe cancelling..."
  • [12] BRAESISH Warn if PostgreSQL appears stalled
  • [13] LVQXQIYA Kill active build steps when builds are cancelled
  • [14] DKJFD6JN Process Nix API changes
  • [15] N4IROACV Move buildRemote() into State
  • [16] BG6PEOB2 Make the output size limit configurable
  • [17] UVNTWTWG Prevent download of NARs we just uploaded
  • [18] MJL3PDXI Fix duplicate step_finished notification
  • [19] NTEDD7T4 Provide a plugin hook for when build steps finish
  • [20] UVQJBDHN Move log compression to a plugin
  • [21] MHVIT4JY Split hydra-queue-runner.cc more
  • [22] FITVNQ2S Keep track of the time we spend copying to/from build machines
  • [23] UNVMKJV5 Unify build and step status codes
  • [*] 4I2HF4L3 Unindent
  • [*] IK2UBDAU Revive jobset scheduling
  • [*] 24BMQDZA Start of single-process hydra-queue-runner
  • [*] NWFDDRUG hydra-queue-runner: Limit concurrent database connections
  • [*] HJOEIMLR Refactor
  • [*] TTBLPQAJ Keep track of wait time per system type
  • [*] OG3Z3QGC Namespace cleanup
  • [*] EYR3EW6J Keep stats for the Hydra auto scaler

Change contents

  • replacement in src/hydra-queue-runner/builder.cc at line 379
    [5.9302][5.9302:9315]()
    } else {
    [5.9302]
    [5.303]
    } else
    failStep(*conn, step, buildId, result, machine, stepFinished, quit);
  • replacement in src/hydra-queue-runner/builder.cc at line 382
    [5.304][5.9316:9445](),[5.9316][5.9316:9445]()
    /* Register failure in the database for all Build objects that
    directly or indirectly depend on this step. */
    [5.304]
    [5.9445]
    // FIXME: keep stats about aborted steps?
    nrStepsDone++;
    totalStepTime += stepStopTime - stepStartTime;
    totalStepBuildTime += result.stopTime - result.startTime;
    machine->state->nrStepsDone++;
    machine->state->totalStepTime += stepStopTime - stepStartTime;
    machine->state->totalStepBuildTime += result.stopTime - result.startTime;
  • replacement in src/hydra-queue-runner/builder.cc at line 390
    [5.9446][5.9446:9489]()
    std::vector<BuildID> dependentIDs;
    [5.9446]
    [5.9489]
    if (quit) exit(0); // testing hack; FIXME: this won't run plugins
  • replacement in src/hydra-queue-runner/builder.cc at line 392
    [5.9490][5.9490:9513](),[5.9514][5.9514:9780]()
    while (true) {
    /* Get the builds and steps that depend on this step. */
    std::set<Build::ptr> indirect;
    {
    auto steps_(steps.lock());
    std::set<Step::ptr> steps;
    getDependents(step, indirect, steps);
    [5.9490]
    [5.9780]
    return sDone;
    }
  • edit in src/hydra-queue-runner/builder.cc at line 395
    [5.9781][5.9781:10074](),[5.10074][4.5050:5195](),[4.5195][5.10173:10246](),[5.10173][5.10173:10246](),[5.10273][5.10273:10305]()
    /* If there are no builds left, delete all referring
    steps from ‘steps’. As for the success case, we can
    be certain no new referrers can be added. */
    if (indirect.empty()) {
    for (auto & s : steps) {
    printMsg(lvlDebug, "finishing build step ‘%s’",
    localStore->printStorePath(s->drvPath));
    steps_->erase(s->drvPath);
    }
    }
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 396
    [5.306][5.306:363]()
    if (indirect.empty() && stepFinished) break;
    [5.306]
    [5.10305]
    void State::failStep(
    Connection & conn,
    Step::ptr step,
    BuildID buildId,
    const RemoteResult & result,
    Machine::ptr machine,
    bool & stepFinished,
    bool & quit)
    {
    /* Register failure in the database for all Build objects that
    directly or indirectly depend on this step. */
  • replacement in src/hydra-queue-runner/builder.cc at line 408
    [5.10306][5.10306:10359](),[5.10359][5.125:168]()
    /* Update the database. */
    {
    auto mc = startDbUpdate();
    [5.10306]
    [5.168]
    std::vector<BuildID> dependentIDs;
  • replacement in src/hydra-queue-runner/builder.cc at line 410
    [5.169][5.10359:10398](),[5.10359][5.10359:10398]()
    pqxx::work txn(*conn);
    [5.169]
    [5.10806]
    while (true) {
    /* Get the builds and steps that depend on this step. */
    std::set<Build::ptr> indirect;
    {
    auto steps_(steps.lock());
    std::set<Step::ptr> steps;
    getDependents(step, indirect, steps);
  • replacement in src/hydra-queue-runner/builder.cc at line 418
    [5.11220][5.2802:3078](),[5.3078][5.11545:11594](),[5.11545][5.11545:11594](),[5.11594][5.3079:3181](),[5.3181][5.2138:2229](),[5.2229][5.11736:11816](),[5.3266][5.11736:11816](),[5.11736][5.11736:11816](),[5.11816][5.2230:2408]()
    /* Create failed build steps for every build that
    depends on this, except when this step is cached
    and is the top-level of that build (since then it's
    redundant with the build's isCachedBuild field). */
    for (auto & build2 : indirect) {
    if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) ||
    (result.stepStatus != bsCachedFailure && buildId == build2->id) ||
    build2->finishedInDB)
    continue;
    createBuildStep(txn, 0, build2->id, step, machine->sshName,
    result.stepStatus, result.errorMsg, buildId == build2->id ? 0 : buildId);
    [5.10807]
    [5.11984]
    /* If there are no builds left, delete all referring
    steps from ‘steps’. As for the success case, we can
    be certain no new referrers can be added. */
    if (indirect.empty()) {
    for (auto & s : steps) {
    printMsg(lvlDebug, "finishing build step ‘%s’",
    localStore->printStorePath(s->drvPath));
    steps_->erase(s->drvPath);
  • edit in src/hydra-queue-runner/builder.cc at line 427
    [5.12002]
    [5.12002]
    }
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 430
    [5.12212][5.12212:12527](),[5.12527][5.0:190](),[5.184][5.12696:12733](),[5.190][5.12696:12733](),[5.12696][5.12696:12733](),[5.12733][5.3520:3660](),[5.3660][5.12855:12940](),[5.12855][5.12855:12940](),[5.12940][5.3661:3740](),[5.3740][5.12996:13050](),[5.12996][5.12996:13050]()
    /* Mark all builds that depend on this derivation as failed. */
    for (auto & build2 : indirect) {
    if (build2->finishedInDB) continue;
    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
    txn.parameterized
    ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")
    (build2->id)
    ((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
    (result.startTime)
    (result.stopTime)
    (result.stepStatus == bsCachedFailure ? 1 : 0).exec();
    nrBuildsDone++;
    }
    [5.12003]
    [5.13050]
    if (indirect.empty() && stepFinished) break;
  • replacement in src/hydra-queue-runner/builder.cc at line 432
    [5.13051][5.13051:13165](),[5.13165][5.3741:3818](),[5.3818][4.5196:5384]()
    /* Remember failed paths in the database so that they
    won't be built again. */
    if (result.stepStatus != bsCachedFailure && result.canCache)
    for (auto & path : step->drv->outputPaths())
    txn.parameterized("insert into FailedPaths values ($1)")(localStore->printStorePath(path)).exec();
    [5.13051]
    [5.13409]
    /* Update the database. */
    {
    auto mc = startDbUpdate();
  • replacement in src/hydra-queue-runner/builder.cc at line 436
    [5.13410][5.13410:13454]()
    txn.commit();
    }
    [5.13410]
    [5.352]
    pqxx::work txn(conn);
  • replacement in src/hydra-queue-runner/builder.cc at line 438
    [5.353][5.353:386]()
    stepFinished = true;
    [5.353]
    [5.13454]
    /* Create failed build steps for every build that
    depends on this, except when this step is cached
    and is the top-level of that build (since then it's
    redundant with the build's isCachedBuild field). */
    for (auto & build : indirect) {
    if ((result.stepStatus == bsCachedFailure && build->drvPath == step->drvPath) ||
    ((result.stepStatus != bsCachedFailure && result.stepStatus != bsUnsupported) && buildId == build->id) ||
    build->finishedInDB)
    continue;
    createBuildStep(txn,
    0, build->id, step, machine ? machine->sshName : "",
    result.stepStatus, result.errorMsg, buildId == build->id ? 0 : buildId);
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 452
    [5.13455][5.13455:13841]()
    /* Remove the indirect dependencies from ‘builds’. This
    will cause them to be destroyed. */
    for (auto & b : indirect) {
    auto builds_(builds.lock());
    b->finishedInDB = true;
    builds_->erase(b->id);
    dependentIDs.push_back(b->id);
    if (buildOne == b->id) quit = true;
    [5.13455]
    [5.13841]
    /* Mark all builds that depend on this derivation as failed. */
    for (auto & build : indirect) {
    if (build->finishedInDB) continue;
    printMsg(lvlError, format("marking build %1% as failed") % build->id);
    txn.parameterized
    ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")
    (build->id)
    ((int) (build->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
    (result.startTime)
    (result.stopTime)
    (result.stepStatus == bsCachedFailure ? 1 : 0).exec();
    nrBuildsDone++;
  • edit in src/hydra-queue-runner/builder.cc at line 465
    [5.13855][5.13855:13865]()
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 466
    [5.13866][5.13866:13945](),[5.13945][3.354:450]()
    /* Send notification about this build and its dependents. */
    {
    pqxx::work txn(*conn);
    notifyBuildFinished(txn, buildId, dependentIDs);
    [5.13866]
    [3.450]
    /* Remember failed paths in the database so that they
    won't be built again. */
    if (result.stepStatus != bsCachedFailure && result.canCache)
    for (auto & path : step->drv->outputPaths())
    txn.parameterized("insert into FailedPaths values ($1)")(localStore->printStorePath(path)).exec();
  • edit in src/hydra-queue-runner/builder.cc at line 474
    [5.14165][5.14165:14171]()
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 475
    [5.14172][5.14172:14530]()
    // FIXME: keep stats about aborted steps?
    nrStepsDone++;
    totalStepTime += stepStopTime - stepStartTime;
    totalStepBuildTime += result.stopTime - result.startTime;
    machine->state->nrStepsDone++;
    machine->state->totalStepTime += stepStopTime - stepStartTime;
    machine->state->totalStepBuildTime += result.stopTime - result.startTime;
    [5.14172]
    [5.14530]
    stepFinished = true;
  • replacement in src/hydra-queue-runner/builder.cc at line 477
    [5.14531][5.321:391]()
    if (quit) exit(0); // testing hack; FIXME: this won't run plugins
    [5.14531]
    [5.14570]
    /* Remove the indirect dependencies from ‘builds’. This
    will cause them to be destroyed. */
    for (auto & b : indirect) {
    auto builds_(builds.lock());
    b->finishedInDB = true;
    builds_->erase(b->id);
    dependentIDs.push_back(b->id);
    if (buildOne == b->id) quit = true;
    }
    }
  • replacement in src/hydra-queue-runner/builder.cc at line 488
    [5.14571][5.297:315]()
    return sDone;
    [5.14571]
    [5.14589]
    /* Send notification about this build and its dependents. */
    {
    pqxx::work txn(conn);
    notifyBuildFinished(txn, buildId, dependentIDs);
    txn.commit();
    }
  • edit in src/hydra-queue-runner/dispatcher.cc at line 302
    [25.4651]
    [5.19812]
    abortUnsupported();
  • edit in src/hydra-queue-runner/dispatcher.cc at line 317
    [5.19999]
    [26.1304]
    void State::abortUnsupported()
    {
    /* Make a copy of 'runnable' and 'machines' so we don't block them
    very long. */
    auto runnable2 = *runnable.lock();
    auto machines2 = *machines.lock();
    system_time now = std::chrono::system_clock::now();
    auto now2 = time(0);
  • edit in src/hydra-queue-runner/dispatcher.cc at line 329
    [26.1305]
    [26.1305]
    std::unordered_set<Step::ptr> aborted;
  • edit in src/hydra-queue-runner/dispatcher.cc at line 331
    [26.1306]
    [26.1306]
    for (auto & wstep : runnable2) {
    auto step(wstep.lock());
    if (!step) continue;
    bool supported = false;
    for (auto & machine : machines2) {
    if (machine.second->supportsStep(step)) {
    step->state.lock()->lastSupported = now;
    supported = true;
    break;
    }
    }
    if (!supported
    && std::chrono::duration_cast<std::chrono::seconds>(now - step->state.lock()->lastSupported).count() >= maxUnsupportedTime)
    {
    printError("aborting unsupported build step '%s' (type '%s')",
    localStore->printStorePath(step->drvPath),
    step->systemType);
    aborted.insert(step);
    auto conn(dbPool.get());
    std::set<Build::ptr> dependents;
    std::set<Step::ptr> steps;
    getDependents(step, dependents, steps);
    /* Maybe the step got cancelled. */
    if (dependents.empty()) continue;
    /* Find the build that has this step as the top-level (if
    any). */
    Build::ptr build;
    for (auto build2 : dependents) {
    if (build2->drvPath == step->drvPath)
    build = build2;
    }
    if (!build) build = *dependents.begin();
    bool stepFinished = false;
    bool quit = false;
    failStep(
    *conn, step, build->id,
    RemoteResult {
    .stepStatus = bsUnsupported,
    .errorMsg = fmt("unsupported system type '%s'",
    step->systemType),
    .startTime = now2,
    .stopTime = now2,
    },
    nullptr, stepFinished, quit);
    if (quit) exit(1);
    }
    }
    /* Clean up 'runnable'. */
    {
    auto runnable_(runnable.lock());
    for (auto i = runnable_->begin(); i != runnable_->end(); ) {
    if (aborted.count(i->lock()))
    i = runnable_->erase(i);
    else
    ++i;
    }
    }
    }
  • edit in src/hydra-queue-runner/hydra-queue-runner.cc at line 49
    [2.43]
    [28.0]
    , maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0))
  • replacement in src/hydra-queue-runner/state.hh at line 71
    [5.3662][5.4211:4241]()
    BuildStatus buildStatus()
    [5.3662]
    [5.3682]
    BuildStatus buildStatus() const
  • edit in src/hydra-queue-runner/state.hh at line 201
    [30.1366]
    [29.3082]
    /* The time that we last saw a machine that supports this
    step. */
    system_time lastSupported = std::chrono::system_clock::now();
  • edit in src/hydra-queue-runner/state.hh at line 310
    [29.4357]
    [31.404]
    /* Time in seconds before unsupported build steps are aborted. */
    const unsigned int maxUnsupportedTime = 0;
  • edit in src/hydra-queue-runner/state.hh at line 492
    [29.8032]
    [29.8032]
    void failStep(
    Connection & conn,
    Step::ptr step,
    BuildID buildId,
    const RemoteResult & result,
    Machine::ptr machine,
    bool & stepFinished,
    bool & quit);
  • edit in src/hydra-queue-runner/state.hh at line 516
    [29.8187]
    [32.3028]
    void abortUnsupported();