-
Notifications
You must be signed in to change notification settings - Fork 217
Healthcheck Endpoint #843
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Healthcheck Endpoint #843
Changes from all commits
836f2f0
0c66a84
5f240aa
153ca32
8aefc73
828ff61
2feb895
2d1b5da
5abb3c4
01240fd
cb81244
b27700c
ecab323
fb7bdf1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,15 +54,19 @@ | |
| import io.mantisrx.master.events.LifecycleEventPublisher; | ||
| import io.mantisrx.master.events.LifecycleEventsProto; | ||
| import io.mantisrx.master.jobcluster.job.CostsCalculator; | ||
| import io.mantisrx.master.jobcluster.job.FilterableMantisWorkerMetadataWritable; | ||
| import io.mantisrx.master.jobcluster.job.IMantisJobMetadata; | ||
| import io.mantisrx.master.jobcluster.job.JobActor; | ||
| import io.mantisrx.master.jobcluster.job.JobHelper; | ||
| import io.mantisrx.master.jobcluster.job.JobState; | ||
| import io.mantisrx.master.jobcluster.job.MantisJobMetadataImpl; | ||
| import io.mantisrx.master.jobcluster.job.MantisJobMetadataView; | ||
| import io.mantisrx.master.jobcluster.job.worker.IMantisWorkerMetadata; | ||
| import io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UnreadyWorker; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DeleteJobClusterResponse; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.HealthCheckRequest; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterRequest; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterResponse; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.EnableJobClusterRequest; | ||
|
|
@@ -110,14 +114,17 @@ | |
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateSchedulingInfoResponse; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterProto; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.HealthCheckResponse; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterProto.JobStartedEvent; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UnreadyWorkers; | ||
| import io.mantisrx.master.jobcluster.proto.JobClusterScalerRuleProto; | ||
| import io.mantisrx.master.jobcluster.proto.JobProto; | ||
| import io.mantisrx.master.jobcluster.scaler.IJobClusterScalerRuleData; | ||
| import io.mantisrx.master.jobcluster.scaler.JobClusterScalerRuleDataFactory; | ||
| import io.mantisrx.master.jobcluster.scaler.JobClusterScalerRuleDataImplWritable; | ||
| import io.mantisrx.runtime.JobSla; | ||
| import io.mantisrx.runtime.MantisJobState; | ||
| import io.mantisrx.runtime.command.InvalidJobException; | ||
| import io.mantisrx.server.core.JobCompletedReason; | ||
| import io.mantisrx.server.master.InvalidJobRequestException; | ||
|
|
@@ -617,6 +624,7 @@ private Receive buildInitializedBehavior() { | |
| new EnableJobClusterResponse(x.requestId, SUCCESS, genUnexpectedMsg(x.toString(), | ||
| this.name, state)), getSelf())) | ||
| .match(GetJobClusterRequest.class, this::onJobClusterGet) | ||
| .match(HealthCheckRequest.class, this::onHealthCheck) | ||
| .match(JobClusterProto.DeleteJobClusterRequest.class, this::onJobClusterDelete) | ||
| .match(ListArchivedWorkersRequest.class, this::onListArchivedWorkers) | ||
| .match(ListCompletedJobsInClusterRequest.class, this::onJobListCompleted) | ||
|
|
@@ -2633,6 +2641,58 @@ public void onJobScalerRuleStream(JobClusterScalerRuleProto.GetJobScalerRuleStre | |
| } | ||
| } | ||
|
|
||
| public void onHealthCheck(final HealthCheckRequest request) { | ||
| final ActorRef sender = getSender(); | ||
| final ActorRef self = getSelf(); | ||
|
|
||
| Set<JobId> prefilteredJobIds = new HashSet<>(); | ||
| if (request.jobIds != null && !request.jobIds.isEmpty()) { | ||
| request.jobIds.stream() | ||
| .map(JobId::fromId) | ||
| .filter(Optional::isPresent) | ||
| .map(Optional::get) | ||
| .forEach(prefilteredJobIds::add); | ||
| } | ||
|
|
||
| ListJobCriteria criteria = new ListJobCriteria(); | ||
| getFilteredNonTerminalJobList(criteria, prefilteredJobIds) | ||
| .collect(Lists::<UnreadyWorker>newArrayList, (unreadyWorkers, view) -> { | ||
| if (view.getWorkerMetadataList() != null) { | ||
| for (FilterableMantisWorkerMetadataWritable worker : view.getWorkerMetadataList()) { | ||
| if (worker.getState() != MantisJobState.Started) { | ||
| unreadyWorkers.add(new UnreadyWorker( | ||
| worker.getWorkerIndex(), | ||
| worker.getWorkerNumber(), | ||
| worker.getState().name())); | ||
|
Comment on lines
+2662
to
+2666
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this categorizing pending workers as failed?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IMO the health logic should be a composite of several checks (all workers for each index have started; the count of failed/resubmitted workers in the past x duration is below a threshold; alert pack, etc.).
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, maybe I can improve the naming here, but the behavior is intentional. The idea is that after a deploy you can call this endpoint with n retries until the workers have started. So if some are stuck in accepted/pending, the healthcheck will fail. Alternatively, since we're using SSE, we could just keep the connection open and send an event when the workers start or a timeout is reached. But I think the way it's implemented is easier to build on top of. Edit: Didn't see your second comment until refreshing. Agreed on the composite; I'm just planning to add the rest of that within nfmantis.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe change the name to something like "notReady"?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good call, went with unready |
||
| } | ||
| } | ||
| } | ||
| }) | ||
| .subscribe( | ||
| unreadyWorkers -> { | ||
| if (unreadyWorkers.isEmpty()) { | ||
| HealthCheckResponse response | ||
| = new HealthCheckResponse(request.requestId, ResponseCode.SUCCESS, "OK", true, null); | ||
| sender.tell(response, self); | ||
| } else { | ||
| HealthCheckResponse response | ||
| = new HealthCheckResponse( | ||
| request.requestId, | ||
| ResponseCode.SUCCESS, | ||
| "unready workers", | ||
| false, | ||
| new UnreadyWorkers(unreadyWorkers)); | ||
| sender.tell(response, self); | ||
| } | ||
| }, | ||
| error -> { | ||
| logger.error("Health check failed for cluster {}", name, error); | ||
| sender.tell(new JobClusterManagerProto.HealthCheckResponse(request.requestId, SERVER_ERROR, | ||
| "Health check failed: " + error.getMessage(), false, null), self); | ||
| } | ||
| ); | ||
| } | ||
|
|
||
| static final class JobInfo { | ||
|
|
||
| final long submittedAt; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what about having users provide the region, and we determine the health for all jobs in that region?
here it seems we are expecting users to provide a list of job IDs — do we have a use case where we only need to inspect a subset of jobs within a given region?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By default, it checks all active jobs. The job-ids param is optional; it just does the default check if it's null. But if you do specify it, it checks only the job IDs that were provided.
This probably won't be used right away, it's more of a forward looking change. In the future, where we have a fully managed Mantis CD system, this will allow us to check just jobs that were recently deployed. It's also an additional knob that gives some of the power users more flexibility.