Skip to content

Commit

Permalink
Fix restart HCAD detector bug (#460)
Browse files Browse the repository at this point in the history
* Fix restart HCAD detector bug

To prevent repeatedly cold starting a model when data is sparse, HCAD keeps a cache that remembers the models for which we have already done a cold start. A second cold start attempt for the same model has to wait for 60 detector intervals. Previously, when stopping a detector, I forgot to clean this cache. As a result, after the detector was restarted, the cache still remembered the model and would not retry cold start for some time. This PR fixes the bug by cleaning the cache when stopping a detector.

Testing done:
1. added unit and integration tests.
2. manually reproduced the issue and verified the fix.

Signed-off-by: Kaituo Li <kaituo@amazon.com>
(cherry picked from commit 9dd9718)
  • Loading branch information
kaituo authored and github-actions[bot] committed Mar 23, 2022
1 parent bf8f2da commit a9d3be9
Show file tree
Hide file tree
Showing 5 changed files with 407 additions and 25 deletions.
8 changes: 7 additions & 1 deletion src/main/java/org/opensearch/ad/ml/EntityColdStarter.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.opensearch.action.ActionListener;
import org.opensearch.action.support.ThreadedActionListener;
import org.opensearch.ad.AnomalyDetectorPlugin;
import org.opensearch.ad.CleanState;
import org.opensearch.ad.MaintenanceState;
import org.opensearch.ad.NodeStateManager;
import org.opensearch.ad.caching.DoorKeeper;
Expand All @@ -63,7 +64,7 @@
* Training models for HCAD detectors
*
*/
public class EntityColdStarter implements MaintenanceState {
public class EntityColdStarter implements MaintenanceState, CleanState {
private static final Logger logger = LogManager.getLogger(EntityColdStarter.class);
private final Clock clock;
private final ThreadPool threadPool;
Expand Down Expand Up @@ -743,4 +744,9 @@ public void maintenance() {
}
});
}

/**
 * Removes the entry stored under the given detector id from {@code doorKeepers}.
 *
 * NOTE(review): presumably {@code doorKeepers} is the per-detector record of
 * models that have already attempted a cold start, so dropping the entry lets a
 * restarted detector cold start again without waiting -- confirm against the
 * field declaration, which is not visible here.
 *
 * @param detectorId id of the detector whose entry should be dropped
 */
@Override
public void clear(String detectorId) {
doorKeepers.remove(detectorId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.opensearch.ad.NodeStateManager;
import org.opensearch.ad.caching.CacheProvider;
import org.opensearch.ad.feature.FeatureManager;
import org.opensearch.ad.ml.EntityColdStarter;
import org.opensearch.ad.ml.ModelManager;
import org.opensearch.ad.task.ADTaskCacheManager;
import org.opensearch.cluster.service.ClusterService;
Expand All @@ -39,6 +40,7 @@ public class DeleteModelTransportAction extends
private FeatureManager featureManager;
private CacheProvider cache;
private ADTaskCacheManager adTaskCacheManager;
private EntityColdStarter coldStarter;

@Inject
public DeleteModelTransportAction(
Expand All @@ -50,7 +52,8 @@ public DeleteModelTransportAction(
ModelManager modelManager,
FeatureManager featureManager,
CacheProvider cache,
ADTaskCacheManager adTaskCacheManager
ADTaskCacheManager adTaskCacheManager,
EntityColdStarter coldStarter
) {
super(
DeleteModelAction.NAME,
Expand All @@ -68,6 +71,7 @@ public DeleteModelTransportAction(
this.featureManager = featureManager;
this.cache = cache;
this.adTaskCacheManager = adTaskCacheManager;
this.coldStarter = coldStarter;
}

@Override
Expand Down Expand Up @@ -121,6 +125,8 @@ protected DeleteModelNodeResponse nodeOperation(DeleteModelNodeRequest request)

cache.get().clear(adID);

coldStarter.clear(adID);

// delete realtime task cache
adTaskCacheManager.removeRealtimeTaskCache(adID);

Expand Down
Loading

0 comments on commit a9d3be9

Please sign in to comment.