Step cancellation: Don't use pthread_cancel()

[?]
Nov 7, 2016, 6:34 PM
2DNPZFPNI2OM5FKYTC2KE5NKKKAP45AQ2VDDYLZZHCJ35X3EBJRQC

Dependencies

  • [2] NKQOEVVP Get rid of "will retry" messages after "maybe cancelling..."
  • [3] XCDTFZUY hydra-queue-runner: Fix build
  • [4] KPKXKDNG hydra-queue-runner: Fix assertion failure
  • [5] MBWLLEYE hydra-queue-runner: Fix message
  • [6] TTBLPQAJ Keep track of wait time per system type
  • [7] WE5Q2NVI Allow build to be bumped to the front of the queue via the web interface
  • [8] 73YR46NJ hydra-queue-runner: Write directly to a binary cache
  • [9] EHEQ4AY3 Fix retry of transient failures
  • [10] N4IROACV Move buildRemote() into State
  • [11] MHVIT4JY Split hydra-queue-runner.cc more
  • [12] YR2IM6Y5 Temporarily disable machines after a connection failure
  • [13] UNVMKJV5 Unify build and step status codes
  • [14] BG6PEOB2 Make the output size limit configurable
  • [15] DKJFD6JN Process Nix API changes
  • [16] 5AIYUMTB Basic remote building
  • [17] 6LIYTMFU Fix build failure on GCC 5.4
  • [18] LVQXQIYA Kill active build steps when builds are cancelled
  • [19] HJOEIMLR Refactor
  • [*] DIEY5USN Keep better bytesReceived/bytesSent stats

Change contents

  • replacement in src/hydra-queue-runner/build-remote.cc at line 120
    [6.274][6.274:301]()
    RemoteResult & result)
    [6.274]
    [6.2925]
    RemoteResult & result, std::shared_ptr<ActiveStep> activeStep)
  • edit in src/hydra-queue-runner/build-remote.cc at line 140
    [3.553]
    [21.23]
    {
    auto activeStepState(activeStep->state_.lock());
    if (activeStepState->cancelled) throw Error("step cancelled");
    activeStepState->pid = child.pid;
    }
    Finally clearPid([&]() {
    auto activeStepState(activeStep->state_.lock());
    activeStepState->pid = -1;
    /* FIXME: there is a slight race here with step
    cancellation in State::processQueueChange(), which
    could call kill() on this pid after we've done waitpid()
    on it. With pid wrap-around, there is a tiny
    possibility that we end up killing another
    process. Meh. */
    });
  • edit in src/hydra-queue-runner/builder.cc at line 18
    [4.96][4.96:139]()
    activeStep->threadId = pthread_self();
  • edit in src/hydra-queue-runner/builder.cc at line 21
    [6.130][4.185:220]()
    activeStep->threadId = -1;
  • replacement in src/hydra-queue-runner/builder.cc at line 28
    [6.1193][2.30:96]()
    res = doBuildStep(destStore, step, reservation->machine);
    [6.1193]
    [6.499]
    res = doBuildStep(destStore, reservation, activeStep);
  • replacement in src/hydra-queue-runner/builder.cc at line 57
    [6.1479][2.151:231](),[2.231][6.1552:1578](),[6.694][6.1552:1578](),[6.1329][6.1552:1578](),[6.1552][6.1552:1578]()
    State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
    Machine::ptr machine)
    [6.1479]
    [6.1578]
    State::StepResult State::doBuildStep(nix::ref<Store> destStore,
    MachineReservation::ptr reservation,
    std::shared_ptr<ActiveStep> activeStep)
  • edit in src/hydra-queue-runner/builder.cc at line 61
    [6.1580]
    [6.1580]
    auto & step(reservation->step);
    auto & machine(reservation->machine);
  • replacement in src/hydra-queue-runner/builder.cc at line 161
    [6.4376][6.1142:1230]()
    buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, result);
    [6.4376]
    [6.2415]
    buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, result, activeStep);
  • replacement in src/hydra-queue-runner/builder.cc at line 165
    [6.4504][6.2505:2548](),[6.2548][6.4558:4597](),[6.4558][6.4558:4597](),[6.4597][6.0:36](),[6.36][6.1231:1365]()
    result.stepStatus = bsAborted;
    result.errorMsg = e.msg();
    result.canRetry = true;
    } catch (__cxxabiv1::__forced_unwind & e) {
    /* The queue monitor thread cancelled this step. */
    try {
    [6.4504]
    [5.0]
    if (activeStep->state_.lock()->cancelled) {
  • replacement in src/hydra-queue-runner/builder.cc at line 167
    [5.88][6.1453:1776](),[6.1453][6.1453:1776]()
    pqxx::work txn(*conn);
    finishBuildStep(txn, result.startTime, time(0), result.overhead, buildId,
    stepNr, machine->sshName, bsCancelled, "");
    txn.commit();
    stepFinished = true;
    } catch (...) {
    ignoreException();
    [5.88]
    [6.1776]
    result.stepStatus = bsCancelled;
    result.canRetry = false;
    } else {
    result.stepStatus = bsAborted;
    result.errorMsg = e.msg();
    result.canRetry = true;
  • edit in src/hydra-queue-runner/builder.cc at line 174
    [6.1790][6.1790:1809]()
    throw;
  • edit in src/hydra-queue-runner/queue-monitor.cc at line 340
    [6.4131][6.4131:4253]()
    auto threadId = activeStep->threadId; // FIXME: use Sync or atomic?
    if (threadId == 0) continue;
  • replacement in src/hydra-queue-runner/queue-monitor.cc at line 345
    [6.4449][6.4449:4759]()
    printInfo("cancelling thread for build step ‘%s’", activeStep->step->drvPath);
    int err = pthread_cancel(threadId);
    if (err)
    printError("error cancelling thread for build step ‘%s’: %s",
    activeStep->step->drvPath, strerror(err));
    [6.4449]
    [6.1940]
    {
    auto activeStepState(activeStep->state_.lock());
    if (activeStepState->cancelled) continue;
    activeStepState->cancelled = true;
    if (activeStepState->pid != -1) {
    printInfo("killing builder process %d of build step ‘%s’",
    activeStepState->pid, activeStep->step->drvPath);
    if (kill(activeStepState->pid, SIGINT) == -1)
    printError("error killing build step ‘%s’: %s",
    activeStep->step->drvPath, strerror(errno));
    }
    }
  • replacement in src/hydra-queue-runner/state.hh at line 380
    [4.321][4.321:349]()
    pthread_t threadId;
    [4.321]
    [4.349]
    struct State
    {
    pid_t pid = -1;
    bool cancelled = false;
    };
    nix::Sync<State> state_;
  • replacement in src/hydra-queue-runner/state.hh at line 486
    [2.431][2.431:478]()
    Step::ptr step, Machine::ptr machine);
    [2.431]
    [6.8478]
    MachineReservation::ptr reservation,
    std::shared_ptr<ActiveStep> activeStep);
  • replacement in src/hydra-queue-runner/state.hh at line 492
    [6.1984][6.1984:2016]()
    RemoteResult & result);
    [6.1984]
    [6.2016]
    RemoteResult & result, std::shared_ptr<ActiveStep> activeStep);