Temporarily disable machines on any exception, not just connection failures
[?]
Mar 22, 2016, 3:54 PM
BYVRA54QBKHLFOPIRBJKZZI7JYBYHSOK7MIA3TUZTALZQJGG3G7QC
Dependencies
- [2] 4VYY2ADP Allow the machines file to specify host public keys
- [3] DIEY5USN Keep better bytesReceived/bytesSent stats
- [4] BG6PEOB2 Make the output size limit configurable
- [5] HH3LID6L Re-implement log size limits
- [6] FITVNQ2S Keep track of the time we spend copying to/from build machines
- [7] TDSBTZKX Add log message
- [8] 5AIYUMTB Basic remote building
- [9] HHOMBU7G hydra-queue-runner: Implement timeouts
- [10] YR2IM6Y5 Temporarily disable machines after a connection failure
- [11] A2GL5FOZ Moar stats
- [12] NAYQT2GT hydra-queue-runner: Use cmdBuildDerivation
- [13] UVNTWTWG Prevent download of NARs we just uploaded
- [14] 6EO3HVNA Merge remote-tracking branch 'origin/master' into binary-cache
- [15] 73YR46NJ hydra-queue-runner: Write directly to a binary cache
- [16] MB3TISH2 Rate-limit the number of threads copying closures at the same time
- [17] GS4BE6TB Asynchronously compress build logs
- [18] CNLNT3T4 Allow only 1 thread to send a closure to a given machine at the same time
- [19] OCZ4LSGG Automatically retry aborted builds
- [20] VZKB5CIE Workaround for RemoteStore not supporting cmdBuildDerivation yet
- [21] AUMIJSEO Fix remote building on Nix 1.10
- [22] 5LBMP7GA Fix remote building
- [23] AF74AH2S Remove localhost hack
- [24] N4IROACV Move buildRemote() into State
- [25] DWFTK56E Keep track of how many threads are waiting
- [26] 7LB6QBXY Keep track of the number of build steps that are being built
- [27] RSISSEU6 Enable substitution on the build machines
- [28] SL3WSRAC hydra-queue-runner: Limit memory usage
- [29] LE4VZIY5 More stats
Change contents
- replacement in src/hydra-queue-runner/build-remote.cc at line 133
Child child;openConnection(machine, tmpDir, logFD, child);try { - replacement in src/hydra-queue-runner/build-remote.cc at line 135
logFD.close();Child child;openConnection(machine, tmpDir, logFD, child); - replacement in src/hydra-queue-runner/build-remote.cc at line 138
FdSource from(child.from);FdSink to(child.to);logFD.close(); - replacement in src/hydra-queue-runner/build-remote.cc at line 140
Finally updateStats([&]() {bytesReceived += from.read;bytesSent += to.written;});FdSource from(child.from);FdSink to(child.to); - replacement in src/hydra-queue-runner/build-remote.cc at line 143
/* Handshake. */bool sendDerivation = true;unsigned int remoteVersion;Finally updateStats([&]() {bytesReceived += from.read;bytesSent += to.written;}); - replacement in src/hydra-queue-runner/build-remote.cc at line 148[5.33]→[5.79:89](∅→∅),[5.165]→[5.79:89](∅→∅),[5.3414]→[5.79:89](∅→∅),[5.89]→[5.34:72](∅→∅),[5.72]→[5.172:192](∅→∅),[5.221]→[5.172:192](∅→∅),[5.172]→[5.172:192](∅→∅)
try {to << SERVE_MAGIC_1 << 0x202;to.flush();/* Handshake. */bool sendDerivation = true;unsigned int remoteVersion; - replacement in src/hydra-queue-runner/build-remote.cc at line 152[5.3506]→[5.193:273](∅→∅),[5.273]→[5.418:533](∅→∅),[5.533]→[5.73:168](∅→∅),[5.168]→[5.534:655](∅→∅),[5.475]→[5.534:655](∅→∅),[5.655]→[5.169:221](∅→∅),[5.46]→[5.268:304](∅→∅),[5.90]→[5.268:304](∅→∅),[5.221]→[5.268:304](∅→∅),[5.268]→[5.268:304](∅→∅)
unsigned int magic = readInt(from);if (magic != SERVE_MAGIC_2)throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % machine->sshName);remoteVersion = readInt(from);if (GET_PROTOCOL_MAJOR(remoteVersion) != 0x200)throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % machine->sshName);if (GET_PROTOCOL_MINOR(remoteVersion) >= 1)sendDerivation = false;try {to << SERVE_MAGIC_1 << 0x202;to.flush();unsigned int magic = readInt(from);if (magic != SERVE_MAGIC_2)throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % machine->sshName);remoteVersion = readInt(from);if (GET_PROTOCOL_MAJOR(remoteVersion) != 0x200)throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % machine->sshName);if (GET_PROTOCOL_MINOR(remoteVersion) >= 1)sendDerivation = false; - replacement in src/hydra-queue-runner/build-remote.cc at line 165
} catch (EndOfFile & e) {child.pid.wait(true);} catch (EndOfFile & e) {child.pid.wait(true);string s = chomp(readFile(result.logFile));throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s);} - edit in src/hydra-queue-runner/build-remote.cc at line 172
/* Disable this machine until a certain period of time haspassed. This period increases on every consecutivefailure. However, don't count failures that occurredsoon after the last one (to take into account stepsstarted in parallel). */ - replacement in src/hydra-queue-runner/build-remote.cc at line 173
auto now = std::chrono::system_clock::now();if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);info->lastFailure = now;int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);info->disabledUntil = now + std::chrono::seconds(delta);}info->consecutiveFailures = 0; - replacement in src/hydra-queue-runner/build-remote.cc at line 176[5.1009]→[5.354:406](∅→∅),[5.647]→[5.354:406](∅→∅),[5.406]→[5.656:744](∅→∅),[5.485]→[5.0:7](∅→∅),[5.744]→[5.0:7](∅→∅),[5.749]→[5.0:7](∅→∅),[5.7]→[5.1010:1117](∅→∅),[5.1117]→[5.305:376](∅→∅),[5.7]→[5.305:376](∅→∅),[5.376]→[5.0:73](∅→∅),[5.73]→[5.446:650](∅→∅),[5.446]→[5.446:650](∅→∅),[5.650]→[5.74:115](∅→∅)
string s = chomp(readFile(result.logFile));throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s);}{auto info(machine->state->connectInfo.lock());info->consecutiveFailures = 0;}/* Gather the inputs. If the remote side is Nix <= 1.9, we have tocopy the entire closure of ‘drvPath’, as well as the requiredoutputs of the input derivations. On Nix > 1.9, we only need tocopy the immediate sources of the derivation and the requiredoutputs of the input derivations. */PathSet inputs;BasicDerivation basicDrv(step->drv);/* Gather the inputs. If the remote side is Nix <= 1.9, we have tocopy the entire closure of ‘drvPath’, as well as the requiredoutputs of the input derivations. On Nix > 1.9, we only need tocopy the immediate sources of the derivation and the requiredoutputs of the input derivations. */PathSet inputs;BasicDerivation basicDrv(step->drv); - replacement in src/hydra-queue-runner/build-remote.cc at line 184
if (sendDerivation)inputs.insert(step->drvPath);elsefor (auto & p : step->drv.inputSrcs)inputs.insert(p);if (sendDerivation)inputs.insert(step->drvPath);elsefor (auto & p : step->drv.inputSrcs)inputs.insert(p); - replacement in src/hydra-queue-runner/build-remote.cc at line 190[5.798]→[5.782:829](∅→∅),[5.782]→[5.782:829](∅→∅),[5.829]→[5.108:252](∅→∅),[5.108]→[5.108:252](∅→∅),[5.252]→[5.799:893](∅→∅),[5.893]→[5.116:171](∅→∅)
for (auto & input : step->drv.inputDrvs) {Derivation drv2 = readDerivation(input.first);for (auto & name : input.second) {auto i = drv2.outputs.find(name);if (i == drv2.outputs.end()) continue;inputs.insert(i->second.path);basicDrv.inputSrcs.insert(i->second.path);for (auto & input : step->drv.inputDrvs) {Derivation drv2 = readDerivation(input.first);for (auto & name : input.second) {auto i = drv2.outputs.find(name);if (i == drv2.outputs.end()) continue;inputs.insert(i->second.path);basicDrv.inputSrcs.insert(i->second.path);} - edit in src/hydra-queue-runner/build-remote.cc at line 199
} - replacement in src/hydra-queue-runner/build-remote.cc at line 200
/* Ensure that the inputs exist in the destination store. This isa no-op for regular stores, but for the binary cache store,this will copy the inputs to the binary cache from the localstore. */destStore->buildPaths(basicDrv.inputSrcs);/* Ensure that the inputs exist in the destination store. This isa no-op for regular stores, but for the binary cache store,this will copy the inputs to the binary cache from the localstore. */destStore->buildPaths(basicDrv.inputSrcs); - replacement in src/hydra-queue-runner/build-remote.cc at line 206[5.3877]→[5.3877:3911](∅→∅),[5.3911]→[5.862:916](∅→∅),[5.916]→[5.0:68](∅→∅),[5.873]→[5.0:68](∅→∅),[5.68]→[5.26:98](∅→∅),[5.205]→[5.26:98](∅→∅),[5.98]→[5.69:251](∅→∅)
/* Copy the input closure. */if (/* machine->sshName != "localhost" */ true) {auto mc1 = std::make_shared<MaintainCount>(nrStepsWaiting);std::lock_guard<std::mutex> sendLock(machine->state->sendLock);mc1.reset();MaintainCount mc2(nrStepsCopyingTo);printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % step->drvPath % machine->sshName);/* Copy the input closure. */if (/* machine->sshName != "localhost" */ true) {auto mc1 = std::make_shared<MaintainCount>(nrStepsWaiting);std::lock_guard<std::mutex> sendLock(machine->state->sendLock);mc1.reset();MaintainCount mc2(nrStepsCopyingTo);printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % step->drvPath % machine->sshName); - replacement in src/hydra-queue-runner/build-remote.cc at line 214
auto now1 = std::chrono::steady_clock::now();auto now1 = std::chrono::steady_clock::now(); - replacement in src/hydra-queue-runner/build-remote.cc at line 216
copyClosureTo(destStore, from, to, inputs, true);copyClosureTo(destStore, from, to, inputs, true); - replacement in src/hydra-queue-runner/build-remote.cc at line 218
auto now2 = std::chrono::steady_clock::now();auto now2 = std::chrono::steady_clock::now(); - replacement in src/hydra-queue-runner/build-remote.cc at line 220[5.113]→[5.113:216](∅→∅),[5.157]→[5.277:283](∅→∅),[5.216]→[5.277:283](∅→∅),[5.678]→[5.277:283](∅→∅),[5.980]→[5.277:283](∅→∅),[5.277]→[5.277:283](∅→∅)
result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();}result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();} - replacement in src/hydra-queue-runner/build-remote.cc at line 223
autoDelete.cancel();autoDelete.cancel(); - replacement in src/hydra-queue-runner/build-remote.cc at line 225
/* Do the build. */printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % step->drvPath % machine->sshName);/* Do the build. */printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % step->drvPath % machine->sshName); - replacement in src/hydra-queue-runner/build-remote.cc at line 228[5.895]→[5.895:919](∅→∅),[5.919]→[5.222:279](∅→∅),[5.279]→[5.1009:1018](∅→∅),[5.1009]→[5.1009:1018](∅→∅),[5.1018]→[5.280:481](∅→∅),[5.209]→[5.4352:4368](∅→∅),[5.481]→[5.4352:4368](∅→∅),[5.1150]→[5.4352:4368](∅→∅),[5.4352]→[5.4352:4368](∅→∅)
if (sendDerivation)to << cmdBuildPaths << PathSet({step->drvPath});elseto << cmdBuildDerivation << step->drvPath << basicDrv;to << maxSilentTime << buildTimeout;if (GET_PROTOCOL_MINOR(remoteVersion) >= 2)to << 64 * 1024 * 1024; // == maxLogSizeto.flush();if (sendDerivation)to << cmdBuildPaths << PathSet({step->drvPath});elseto << cmdBuildDerivation << step->drvPath << basicDrv;to << maxSilentTime << buildTimeout;if (GET_PROTOCOL_MINOR(remoteVersion) >= 2)to << 64 * 1024 * 1024; // == maxLogSizeto.flush(); - replacement in src/hydra-queue-runner/build-remote.cc at line 237[5.1152]→[5.4368:4400](∅→∅),[5.4368]→[5.4368:4400](∅→∅),[5.4400]→[5.55:152](∅→∅),[5.152]→[5.4429:4460](∅→∅),[5.4429]→[5.4429:4460](∅→∅)
result.startTime = time(0);int res;{MaintainCount mc(nrStepsBuilding);res = readInt(from);}result.stopTime = time(0);result.startTime = time(0);int res;{MaintainCount mc(nrStepsBuilding);res = readInt(from);}result.stopTime = time(0); - replacement in src/hydra-queue-runner/build-remote.cc at line 245
if (sendDerivation) {if (res) {result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str();if (res == 100) {result.stepStatus = bsFailed;result.canCache = true;if (sendDerivation) {if (res) {result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str();if (res == 100) {result.stepStatus = bsFailed;result.canCache = true;}else if (res == 101) {result.stepStatus = bsTimedOut;}else {result.stepStatus = bsAborted;result.canRetry = true;}return; - replacement in src/hydra-queue-runner/build-remote.cc at line 261
else if (res == 101) {result.stepStatus = bsTimedOut;}else {result.stepStatus = bsAborted;result.canRetry = true;result.stepStatus = bsSuccess;} else {result.errorMsg = readString(from);switch ((BuildResult::Status) res) {case BuildResult::Built:result.stepStatus = bsSuccess;break;case BuildResult::Substituted:case BuildResult::AlreadyValid:result.stepStatus = bsSuccess;result.isCached = true;break;case BuildResult::PermanentFailure:result.stepStatus = bsFailed;result.canCache = true;result.errorMsg = "";break;case BuildResult::InputRejected:case BuildResult::OutputRejected:result.stepStatus = bsFailed;result.canCache = true;break;case BuildResult::TransientFailure:result.stepStatus = bsFailed;result.canRetry = true;result.errorMsg = "";break;case BuildResult::CachedFailure: // cached on the build machineresult.stepStatus = bsCachedFailure;result.canCache = true;result.errorMsg = "";break;case BuildResult::TimedOut:result.stepStatus = bsTimedOut;result.errorMsg = "";break;case BuildResult::MiscFailure:result.stepStatus = bsAborted;result.canRetry = true;break;case BuildResult::LogLimitExceeded:result.stepStatus = bsLogLimitExceeded;break;default:result.stepStatus = bsAborted;break; - replacement in src/hydra-queue-runner/build-remote.cc at line 308
return;if (result.stepStatus != bsSuccess) return; - replacement in src/hydra-queue-runner/build-remote.cc at line 310[5.1539]→[4.348:387](∅→∅),[4.387]→[5.1583:1596](∅→∅),[5.1583]→[5.1583:1596](∅→∅),[5.1647]→[5.1647:1691](∅→∅),[5.1691]→[4.388:2077](∅→∅)
result.stepStatus = bsSuccess;} else {result.errorMsg = readString(from);switch ((BuildResult::Status) res) {case BuildResult::Built:result.stepStatus = bsSuccess;break;case BuildResult::Substituted:case BuildResult::AlreadyValid:result.stepStatus = bsSuccess;result.isCached = true;break;case BuildResult::PermanentFailure:result.stepStatus = bsFailed;result.canCache = true;result.errorMsg = "";break;case BuildResult::InputRejected:case BuildResult::OutputRejected:result.stepStatus = bsFailed;result.canCache = true;break;case BuildResult::TransientFailure:result.stepStatus = bsFailed;result.canRetry = true;result.errorMsg = "";break;case BuildResult::CachedFailure: // cached on the build machineresult.stepStatus = bsCachedFailure;result.canCache = true;result.errorMsg = "";break;case BuildResult::TimedOut:result.stepStatus = bsTimedOut;result.errorMsg = "";break;case BuildResult::MiscFailure:result.stepStatus = bsAborted;result.canRetry = true;break;case BuildResult::LogLimitExceeded:result.stepStatus = bsLogLimitExceeded;break;default:result.stepStatus = bsAborted;break;result.errorMsg = "";/* If the path was substituted or already valid, then we didn'tget a build log. */if (result.isCached) {printMsg(lvlInfo, format("outputs of ‘%1%’ substituted or already valid on ‘%2%’") % step->drvPath % machine->sshName);unlink(result.logFile.c_str());result.logFile = ""; - edit in src/hydra-queue-runner/build-remote.cc at line 320[4.2087]→[4.2087:2139](∅→∅),[5.1730]→[5.4785:4791](∅→∅),[4.2139]→[5.4785:4791](∅→∅),[5.4785]→[5.4785:4791](∅→∅)
if (result.stepStatus != bsSuccess) return;} - replacement in src/hydra-queue-runner/build-remote.cc at line 321
result.errorMsg = "";/* Copy the output paths. */if (/* machine->sshName != "localhost" */ true) {MaintainCount mc(nrStepsCopyingFrom); - replacement in src/hydra-queue-runner/build-remote.cc at line 325[5.4792]→[5.1731:1826](∅→∅),[5.1826]→[4.2168:2195](∅→∅),[4.2195]→[5.0:136](∅→∅),[5.1925]→[5.0:136](∅→∅),[5.136]→[5.1925:2000](∅→∅),[5.1925]→[5.1925:2000](∅→∅)
/* If the path was substituted or already valid, then we didn'tget a build log. */if (result.isCached) {printMsg(lvlInfo, format("outputs of ‘%1%’ substituted or already valid on ‘%2%’") % step->drvPath % machine->sshName);unlink(result.logFile.c_str());result.logFile = "";}auto now1 = std::chrono::steady_clock::now(); - replacement in src/hydra-queue-runner/build-remote.cc at line 327[5.2001]→[5.4792:4825](∅→∅),[5.4792]→[5.4792:4825](∅→∅),[5.4825]→[5.981:1035](∅→∅),[5.1035]→[5.0:46](∅→∅)
/* Copy the output paths. */if (/* machine->sshName != "localhost" */ true) {MaintainCount mc(nrStepsCopyingFrom);PathSet outputs;for (auto & output : step->drv.outputs)outputs.insert(output.second.path); - replacement in src/hydra-queue-runner/build-remote.cc at line 331
auto now1 = std::chrono::steady_clock::now();/* Query the size of the output paths. */size_t totalNarSize = 0;to << cmdQueryPathInfos << outputs;to.flush();while (true) {if (readString(from) == "") break;readString(from); // deriverreadStrings<PathSet>(from); // referencesreadLongLong(from); // download sizetotalNarSize += readLongLong(from);} - replacement in src/hydra-queue-runner/build-remote.cc at line 343[5.102]→[5.816:841](∅→∅),[5.1404]→[5.816:841](∅→∅),[5.816]→[5.816:841](∅→∅),[5.841]→[5.1405:1453](∅→∅),[5.1453]→[5.883:931](∅→∅),[5.883]→[5.883:931](∅→∅)
PathSet outputs;for (auto & output : step->drv.outputs)outputs.insert(output.second.path);if (totalNarSize > maxOutputSize) {result.stepStatus = bsNarSizeLimitExceeded;return;} - replacement in src/hydra-queue-runner/build-remote.cc at line 348
/* Query the size of the output paths. */size_t totalNarSize = 0;to << cmdQueryPathInfos << outputs;to.flush();while (true) {if (readString(from) == "") break;readString(from); // deriverreadStrings<PathSet>(from); // referencesreadLongLong(from); // download sizetotalNarSize += readLongLong(from);}printMsg(lvlDebug, format("copying outputs of ‘%s’ from ‘%s’ (%d bytes)")% step->drvPath % machine->sshName % totalNarSize); - replacement in src/hydra-queue-runner/build-remote.cc at line 351
if (totalNarSize > maxOutputSize) {result.stepStatus = bsNarSizeLimitExceeded;return;}/* Block until we have the required amount of memoryavailable. FIXME: only need this for binary cachedestination stores. */auto resStart = std::chrono::steady_clock::now();auto memoryReservation(memoryTokens.get(totalNarSize));auto resStop = std::chrono::steady_clock::now(); - replacement in src/hydra-queue-runner/build-remote.cc at line 358
printMsg(lvlDebug, format("copying outputs of ‘%s’ from ‘%s’ (%d bytes)")% step->drvPath % machine->sshName % totalNarSize);auto resMs = std::chrono::duration_cast<std::chrono::milliseconds>(resStop - resStart).count();if (resMs >= 1000)printMsg(lvlError, format("warning: had to wait %d ms for %d memory tokens for %s")% resMs % totalNarSize % step->drvPath); - replacement in src/hydra-queue-runner/build-remote.cc at line 363
/* Block until we have the required amount of memoryavailable. FIXME: only need this for binary cachedestination stores. */auto resStart = std::chrono::steady_clock::now();auto memoryReservation(memoryTokens.get(totalNarSize));auto resStop = std::chrono::steady_clock::now();result.accessor = destStore->getFSAccessor(); - replacement in src/hydra-queue-runner/build-remote.cc at line 365
auto resMs = std::chrono::duration_cast<std::chrono::milliseconds>(resStop - resStart).count();if (resMs >= 1000)printMsg(lvlError, format("warning: had to wait %d ms for %d memory tokens for %s")% resMs % totalNarSize % step->drvPath);to << cmdExportPaths << 0 << outputs;to.flush();destStore->importPaths(false, from, result.accessor); - replacement in src/hydra-queue-runner/build-remote.cc at line 369
result.accessor = destStore->getFSAccessor();auto now2 = std::chrono::steady_clock::now(); - replacement in src/hydra-queue-runner/build-remote.cc at line 371
to << cmdExportPaths << 0 << outputs;to.flush();destStore->importPaths(false, from, result.accessor);result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();} - replacement in src/hydra-queue-runner/build-remote.cc at line 374
auto now2 = std::chrono::steady_clock::now();/* Shut down the connection. */child.to.close();child.pid.wait(true); - replacement in src/hydra-queue-runner/build-remote.cc at line 378
result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();} catch (Error & e) {/* Disable this machine until a certain period of time haspassed. This period increases on every consecutivefailure. However, don't count failures that occurred soonafter the last one (to take into account steps started inparallel). */auto info(machine->state->connectInfo.lock());auto now = std::chrono::system_clock::now();if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);info->lastFailure = now;int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);info->disabledUntil = now + std::chrono::seconds(delta);}throw; - edit in src/hydra-queue-runner/build-remote.cc at line 395
/* Shut down the connection. */child.to.close();child.pid.wait(true);